diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..bc4bfa46 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,15 @@ +name: pre-commit + +on: + pull_request: + push: + branches: + - master + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4.3.0 + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..81743682 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/mirrors-yapf # To format the code to conform YAPF + rev: v0.31.0 + hooks: + - id: yapf + args: ['--in-place', '--recursive', '--style', 'google'] + +- repo: https://github.com/myint/docformatter # To format the doc strings to conform PEP257 + rev: v1.4 + hooks: + - id: docformatter + args: [--in-place] + +- repo: https://github.com/pre-commit/pre-commit-hooks # Some common pre-commit hooks + rev: v3.4.0 + hooks: + - id: check-yaml # Checks the syntax of .yaml files. + args: [--allow-multiple-documents] + exclude: 'meta.yaml' # Exclude this because it gives an error for '%' in Line 1 and couldn't fix yet + - id: end-of-file-fixer # Makes sure files end with a newline. + - id: trailing-whitespace # Checks for any tabs or spaces after the last non-whitespace character on the line. + - id: check-docstring-first # Checks that code comes after the docstrings. diff --git a/README.adoc b/README.adoc index 12f1ce98..d7283c10 100644 --- a/README.adoc +++ b/README.adoc @@ -1 +1,39 @@ See the https://ecp-candle.github.io/Supervisor/home.html[Home Page] for more information. + +# Running the feature domain based comparison + +- Create the CANDLE_DATA_DIR. 
Place drug_features.csv in the CANDLE_DATA_DIR + - drug_features.csv should contain the drug features of at least the test set drug molecules +- The paths of the model's directories have to be added to the PYTHONPATH in workflow.sh +- Start the run using the command ./test-small-1.sh SITE, where SITE is the name of the computing system. test-small-1.sh is at workflows/cmp-cv/test +- upf-1.txt is used as the input file to specify the model hyperparameters as well as the model name and candle_image location. + +``` +{"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"} +{"id": "RUN001", "epochs": 2, "model_name": "DrugCell", "candle_image": "/path/to/sif/DrugCell.sif"} +{"id": "RUN002", "epochs": 1, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"} +{"id": "RUN003", "epochs": 2, "model_name": "SWnet_CCLE", "candle_image": "/path/to/sif/SWnet.sif"} +``` + +### Running the specific example at workflows/cmp-cv/test + +- Clone Supervisor from https://github.com/ECP-CANDLE/Supervisor +- Clone the DrugCell and SWnet model directories from https://github.com/gihanpanapitiya/DrugCell/tree/to_candle and https://github.com/gihanpanapitiya/SWnet/tree/to_candle + - Checkout to_candle branches and create the Singularity containers (.sif files) using the command, + + ``` + singularity build --fakeroot /path/for/sif/DrugCell.sif /path/to/DrugCell.def + singularity build --fakeroot /path/for/sif/SWnet.sif /path/to/SWnet.def + ``` + +- Add /path/for/sif/DrugCell.sif and /path/for/sif/SWnet.sif to the PYTHONPATH in workflow.sh +- Create the CANDLE_DATA_DIR. 
Place drug_features.csv in the CANDLE_DATA_DIR +- Run the command ./test-small-1.sh SITE + + +#### Known issues + +- some input files required for analysis have to be manually added to candle data dir +- outputs get written to 'experiments' not CANDLE_DATA_DIR +- python paths have to be explicitly specified in workflow.sh +- singularity container is not being used even though the CANDLE_MODEL_TYPE=SINGULARITY is specified diff --git a/archives/py-loc/p.swift b/archives/py-loc/p.swift index c6803a9e..67255e9e 100644 --- a/archives/py-loc/p.swift +++ b/archives/py-loc/p.swift @@ -5,11 +5,11 @@ import location; L0 = locationFromRank(0); L1 = locationFromRank(1); - + @location=L0 python_persist("L = []"); @location=L1 python_persist("L = []"); -string D[]; -foreach j in [0:9] { +string D[]; +foreach j in [0:9] { L = locationFromRank(j%%2); D[j] = @location=L python_persist("L.append(repr(2+%i)) " % j); } diff --git a/archives/templates/README.md b/archives/templates/README.md index fc271770..74b1a745 100644 --- a/archives/templates/README.md +++ b/archives/templates/README.md @@ -16,17 +16,17 @@ In more detail, here are the steps required for running an arbitrary workflow on 1. Ensure the `$SITE` and `$CANDLE` variables are exported to the environment as specified [here](#CANDLE-settings-at-different-SITEs). 1. Copy the submission script `$CANDLE/Supervisor/templates/submit_candle_job.sh` to a working directory. 1. Specify the model in the submission script: - 1. Set the `$MODEL_PYTHON_SCRIPT` variable to one of the models in the `$CANDLE/Supervisor/templates/models` directory (currently either "resnet", "unet", "uno", or "mnist_mlp"). Or, specify your own [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) Python model by setting both the `$MODEL_PYTHON_DIR` and `$MODEL_PYTHON_SCRIPT` variables as appropriate. - 1. 
Specify the corresponding default model parameters by setting the `$DEFAULT_PARAMS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/model_params` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$DEFAULT_PARAMS_FILE` variable to this file. + 1. Set the `$MODEL_PYTHON_SCRIPT` variable to one of the models in the `$CANDLE/Supervisor/templates/models` directory (currently either "resnet", "unet", "uno", or "mnist_mlp"). Or, specify your own [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) Python model by setting both the `$MODEL_PYTHON_DIR` and `$MODEL_PYTHON_SCRIPT` variables as appropriate. + 1. Specify the corresponding default model parameters by setting the `$DEFAULT_PARAMS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/model_params` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$DEFAULT_PARAMS_FILE` variable to this file. 1. Specify the workflow in the submission script: 1. Set the `$WORKFLOW_TYPE` variable as appropriate (currently supported are "upf", and, to a less-tested extent, "mlrMBO"). - 1. Specify the corresponding workflow settings by setting the `$WORKFLOW_SETTINGS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/workflow_settings` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$WORKFLOW_SETTINGS_FILE` variable to this file. + 1. Specify the corresponding workflow settings by setting the `$WORKFLOW_SETTINGS_FILE` variable to one of the files in the `$CANDLE/Supervisor/templates/workflow_settings` directory. Or, copy one of these template files to the working directory, modify it accordingly, and point the `$WORKFLOW_SETTINGS_FILE` variable to this file. 1. 
Adjust any other variables in the submission script such as the output directory (specified by `$EXPERIMENTS`), the scheduler settings, etc. 1. Run the script from a submit node like `./submit_candle_job.sh`. ## Background -In general, it would be nice to allow for an arbitrary model (U-Net, ResNet, etc.) to be run using an arbitrary workflow (UPF, mlrMBO, etc.), all in an external working directory. For example, here is a sample submission script: +In general, it would be nice to allow for an arbitrary model (U-Net, ResNet, etc.) to be run using an arbitrary workflow (UPF, mlrMBO, etc.), all in an external working directory. For example, here is a sample submission script: ```bash #!/bin/bash @@ -60,13 +60,13 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt $CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/swift/workflow.sh $SITE -a $CANDLE/Supervisor/workflows/common/sh/cfg-sys-$SITE.sh $WORKFLOW_SETTINGS_FILE ``` -When this script is run (no arguments accepted) on a Biowulf submit node, the necessarily [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) file `$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py` will be run using the default parameters specified in `$DEFAULT_PARAMS_FILE`. The CANDLE workflow used will be UPF (specified by `$WORKFLOW_TYPE`) and will be run using the parameters specified in `$WORKFLOW_SETTINGS_FILE`. The results of the job will be output in `$EXPERIMENTS`. Note that we can choose a different workflow by simply changing the value of the `$WORKFLOW_TYPE` variable, e.g., +When this script is run (no arguments accepted) on a Biowulf submit node, the necessarily [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html) file `$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py` will be run using the default parameters specified in `$DEFAULT_PARAMS_FILE`. 
The CANDLE workflow used will be UPF (specified by `$WORKFLOW_TYPE`) and will be run using the parameters specified in `$WORKFLOW_SETTINGS_FILE`. The results of the job will be output in `$EXPERIMENTS`. Note that we can choose a different workflow by simply changing the value of the `$WORKFLOW_TYPE` variable, e.g., ```bash export WORKFLOW_TYPE="mlrMBO" ``` -In the sample submission script above, the Python script containing the model (my_specialized_unet.py), the default model parameters (default_params.txt), and the unrolled parameter file (upf1.txt) are all specified in the "unet" subdirectory of the working directory "/home/weismanal/notebook/2019-02-28". However, often a model, its default parameters, and a workflow's settings can be reused. +In the sample submission script above, the Python script containing the model (my_specialized_unet.py), the default model parameters (default_params.txt), and the unrolled parameter file (upf1.txt) are all specified in the "unet" subdirectory of the working directory "/home/weismanal/notebook/2019-02-28". However, often a model, its default parameters, and a workflow's settings can be reused. Thus, we provide templates of these three types of files in the `$CANDLE/Supervisor/templates` directory, the current structure of which is: @@ -102,7 +102,7 @@ export WORKFLOW_SETTINGS_FILE="/home/weismanal/notebook/2019-02-28/unet/upf1.txt export WORKFLOW_SETTINGS_FILE="$CANDLE/Supervisor/templates/workflow_settings/upf1.txt" ``` -The template submission script located at `$CANDLE/Supervisor/templates/submit_candle_job.sh` utilizes all three of these types of templates and will just work (running an HPO on the MNIST dataset) as long as the `$CANDLE` and `$SITE` variables are set correctly. 
+The template submission script located at `$CANDLE/Supervisor/templates/submit_candle_job.sh` utilizes all three of these types of templates and will just work (running an HPO on the MNIST dataset) as long as the `$CANDLE` and `$SITE` variables are set correctly. ## Notes @@ -119,10 +119,10 @@ mymodel_common = candle.Benchmark(file_path, os.getenv("DEFAULT_PARAMS_FILE"), ' I'd recommend this be added to the standard method for making a model [CANDLE-compliant](https://ecp-candle.github.io/Candle/html/tutorials/writing_candle_code.html). -Note further that `$DEFAULT_PARAMS_FILE` must be a full pathname. Otherwise, if we just used the filename "default_params.txt" hardcoded into the `$MODEL_PYTHON_SCRIPT`, the script would look for this global parameter file in the same directory that it's in (i.e., `$MODEL_PYTHON_DIR`), but that would preclude using a `$MODEL_PYTHON_SCRIPT` that's a symbolic link. In that case, we'd have to always copy the `$MODEL_PYTHON_SCRIPT` to the current working directory, which is inefficient because this leads to unnecessary duplication of code. +Note further that `$DEFAULT_PARAMS_FILE` must be a full pathname. Otherwise, if we just used the filename "default_params.txt" hardcoded into the `$MODEL_PYTHON_SCRIPT`, the script would look for this global parameter file in the same directory that it's in (i.e., `$MODEL_PYTHON_DIR`), but that would preclude using a `$MODEL_PYTHON_SCRIPT` that's a symbolic link. In that case, we'd have to always copy the `$MODEL_PYTHON_SCRIPT` to the current working directory, which is inefficient because this leads to unnecessary duplication of code. 
### CANDLE settings at different SITEs -`$SITE` | `$CANDLE` -:---: | :---: -biowulf | /data/BIDS-HPC/public/candle \ No newline at end of file +| `$SITE` | `$CANDLE` | +| :-----: | :--------------------------: | +| biowulf | /data/BIDS-HPC/public/candle | diff --git a/archives/templates/language_agnostic/submit_candle_job.sh b/archives/templates/language_agnostic/submit_candle_job.sh index 990cc07a..51ed573c 100755 --- a/archives/templates/language_agnostic/submit_candle_job.sh +++ b/archives/templates/language_agnostic/submit_candle_job.sh @@ -8,7 +8,7 @@ export SITE="biowulf" # Job specification export EXPERIMENTS="$MY_DIR" #TODO GZ: These 2 variables are not needed -export MODEL_NAME="mnist_upf_test" +export MODEL_NAME="mnist_upf_test" export OBJ_RETURN="val_loss" # Scheduler settings diff --git a/archives/templates/language_agnostic/train_model.py b/archives/templates/language_agnostic/train_model.py index 5013c6a4..9af290a1 100755 --- a/archives/templates/language_agnostic/train_model.py +++ b/archives/templates/language_agnostic/train_model.py @@ -1,8 +1,8 @@ -import sys -import pickle import os +import pickle import random +import sys -#Generate a random loss function +# Generate a random loss function print(str(sys.argv)) -print(random.uniform(0,1)) +print(random.uniform(0, 1)) diff --git a/archives/templates/model_params/mnist1.txt b/archives/templates/model_params/mnist1.txt index 430bec5a..3a33c6ed 100644 --- a/archives/templates/model_params/mnist1.txt +++ b/archives/templates/model_params/mnist1.txt @@ -3,4 +3,4 @@ epochs=20 batch_size=128 activation='relu' optimizer='rmsprop' -num_filters=32 \ No newline at end of file +num_filters=32 diff --git a/archives/templates/model_params/uno1.txt b/archives/templates/model_params/uno1.txt index 12fbf6b7..8b83f3d9 100644 --- a/archives/templates/model_params/uno1.txt +++ b/archives/templates/model_params/uno1.txt @@ -51,4 +51,4 @@ use_landmark_genes = True validation_split = 0.2 verbose = None warmup_lr = False 
-save='save/uno' \ No newline at end of file +save='save/uno' diff --git a/archives/templates/models/mnist/mnist.py b/archives/templates/models/mnist/mnist.py index 5c5e2837..de2605b1 100644 --- a/archives/templates/models/mnist/mnist.py +++ b/archives/templates/models/mnist/mnist.py @@ -1,13 +1,14 @@ # add candle_keras library in path -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = "/data/BIDS-HPC/public/candle/Candle/common" import sys -sys.path.append(candle_lib) +sys.path.append(candle_lib) import os -#import sys + +# import sys file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +lib_path = os.path.abspath(os.path.join(file_path, "..", "..", "common")) sys.path.append(lib_path) import candle_keras as candle @@ -19,10 +20,11 @@ additional_definitions = None required = None + class MNIST(candle.Benchmark): + def set_locals(self): if required is not None: self.required = set(required) if additional_definitions is not None: self.additional_definitions = additional_definitions - diff --git a/archives/templates/models/mnist/mnist_mlp.py b/archives/templates/models/mnist/mnist_mlp.py index c0c13c12..d2751896 100644 --- a/archives/templates/models/mnist/mnist_mlp.py +++ b/archives/templates/models/mnist/mnist_mlp.py @@ -1,62 +1,64 @@ -import mnist import os -from keras.callbacks import CSVLogger +import mnist from keras import backend as K +from keras.callbacks import CSVLogger + def initialize_parameters(): - mnist_common = mnist.MNIST(mnist.file_path, + mnist_common = mnist.MNIST( + mnist.file_path, os.getenv("DEFAULT_PARAMS_FILE"), - 'keras', - prog='mnist_mlp', - desc='MNIST example' + "keras", + prog="mnist_mlp", + desc="MNIST example", ) import candle_keras as candle # Initialize parameters gParameters = candle.initialize_parameters(mnist_common) - csv_logger = CSVLogger('{}/params.log'.format(gParameters)) + csv_logger = 
CSVLogger("{}/params.log".format(gParameters)) return gParameters + def run(gParameters): ########################################## # Your DL start here. See mnist_mlp.py # ########################################## - '''Trains a simple deep NN on the MNIST dataset. + """Trains a simple deep NN on the MNIST dataset. - Gets to 98.40% test accuracy after 20 epochs - (there is *a lot* of margin for parameter tuning). - 2 seconds per epoch on a K520 GPU. - ''' + Gets to 98.40% test accuracy after 20 epochs (there is *a lot* of + margin for parameter tuning). 2 seconds per epoch on a K520 GPU. + """ # from __future__ import print_function import keras from keras.datasets import mnist - from keras.models import Sequential from keras.layers import Dense, Dropout + from keras.models import Sequential from keras.optimizers import RMSprop - batch_size = gParameters['batch_size'] + batch_size = gParameters["batch_size"] num_classes = 10 - epochs = gParameters['epochs'] + epochs = gParameters["epochs"] - activation = gParameters['activation'] - optimizer = gParameters['optimizer'] + activation = gParameters["activation"] + optimizer = gParameters["optimizer"] # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) - x_train = x_train.astype('float32') - x_test = x_test.astype('float32') + x_train = x_train.astype("float32") + x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 - print(x_train.shape[0], 'train samples') - print(x_test.shape[0], 'test samples') + print(x_train.shape[0], "train samples") + print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) @@ -67,32 +69,37 @@ def run(gParameters): model.add(Dropout(0.2)) model.add(Dense(512, activation=activation)) model.add(Dropout(0.2)) - model.add(Dense(num_classes, activation='softmax')) + 
model.add(Dense(num_classes, activation="softmax")) model.summary() - model.compile(loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['accuracy']) - - history = model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) + model.compile(loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"]) + + history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), + ) score = model.evaluate(x_test, y_test, verbose=0) - print('Test loss:', score[0]) - print('Test accuracy:', score[1]) + print("Test loss:", score[0]) + print("Test accuracy:", score[1]) ########################################## # End of mnist_mlp.py #################### ########################################## return history + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() diff --git a/archives/templates/models/resnet.py b/archives/templates/models/resnet.py index 778e09f5..1de769cf 100644 --- a/archives/templates/models/resnet.py +++ b/archives/templates/models/resnet.py @@ -1,321 +1,439 @@ -from keras import backend as K import os +from keras import backend as K + # Parameters -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = "/data/BIDS-HPC/public/candle/Candle/common" + def initialize_parameters(): - print('Initializing parameters...') - + print("Initializing parameters...") + # Obtain the path of the directory of this script file_path = os.path.dirname(os.path.realpath(__file__)) # Import the CANDLE library import sys + sys.path.append(candle_lib) import candle_keras as candle # Instantiate the candle.Benchmark class - mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + mymodel_common = candle.Benchmark( + file_path, + 
os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprog", + desc="My model", + ) # Get a dictionary of the model hyperparamters gParameters = candle.initialize_parameters(mymodel_common) # Return the dictionary of the hyperparameters - return(gParameters) - + return gParameters + + def run(gParameters): - print('Running model...') + print("Running model...") #### Begin model input ########################################################################################## - - def get_model(model_json_fname,modelwtsfname): + + def get_model(model_json_fname, modelwtsfname): # This is only for prediction if os.path.isfile(model_json_fname): - # Model reconstruction from JSON file - with open(model_json_fname, 'r') as f: + # Model reconstruction from JSON file + with open(model_json_fname, "r") as f: model = model_from_json(f.read()) else: - model = get_unet() - - #model.summary() + model = get_unet() + + # model.summary() # Load weights into the new model model.load_weights(modelwtsfname) - return model - - def focal_loss(gamma=2., alpha=.25): + return model + + def focal_loss(gamma=2.0, alpha=0.25): + def focal_loss_fixed(y_true, y_pred): pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred)) pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred)) - return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. 
- pt_0)) + return -K.sum( + alpha * K.pow(1.0 - pt_1, gamma) * K.log(pt_1)) - K.sum( + (1 - alpha) * K.pow(pt_0, gamma) * K.log(1.0 - pt_0)) + return focal_loss_fixed - + def jaccard_coef(y_true, y_pred): smooth = 1.0 intersection = K.sum(y_true * y_pred, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred, axis=[-0, -1, 2]) - + jac = (intersection + smooth) / (sum_ - intersection + smooth) - + return K.mean(jac) - + def jaccard_coef_int(y_true, y_pred): smooth = 1.0 y_pred_pos = K.round(K.clip(y_pred, 0, 1)) - + intersection = K.sum(y_true * y_pred_pos, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred_pos, axis=[-0, -1, 2]) - + jac = (intersection + smooth) / (sum_ - intersection + smooth) - + return K.mean(jac) - + def jaccard_coef_loss(y_true, y_pred): - return -K.log(jaccard_coef(y_true, y_pred)) + binary_crossentropy(y_pred, y_true) - + return -K.log(jaccard_coef(y_true, y_pred)) + binary_crossentropy( + y_pred, y_true) + def dice_coef_batch(y_true, y_pred): smooth = 1.0 intersection = K.sum(y_true * y_pred, axis=[-0, -1, 2]) sum_ = K.sum(y_true + y_pred, axis=[-0, -1, 2]) - - dice = ((2.0*intersection) + smooth) / (sum_ + intersection + smooth) - + + dice = ((2.0 * intersection) + smooth) / (sum_ + intersection + smooth) + return K.mean(dice) - + def dice_coef(y_true, y_pred): smooth = 1.0 y_true_f = K.flatten(y_true) y_pred_f = K.flatten(y_pred) intersection = K.sum(y_true_f * y_pred_f) - dice_smooth = ((2. 
* intersection) + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) - return (dice_smooth) - + dice_smooth = ((2.0 * intersection) + + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) + return dice_smooth + def dice_coef_loss(y_true, y_pred): return -dice_coef(y_true, y_pred) - + def dice_coef_batch_loss(y_true, y_pred): return -dice_coef_batch(y_true, y_pred) - - #Define the neural network + + # Define the neural network def get_unet(): droprate = 0.25 filt_size = 32 inputs = Input((None, None, 1)) - conv1 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(inputs) - conv1 = Dropout(droprate)(conv1) - conv1 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv1) + conv1 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(inputs) + conv1 = Dropout(droprate)(conv1) + conv1 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv1) pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) - filt_size = filt_size*2 - - conv2 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool1) - conv2 = Dropout(droprate)(conv2) - conv2 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv2) + filt_size = filt_size * 2 + + conv2 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool1) + conv2 = Dropout(droprate)(conv2) + conv2 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv2) pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) - filt_size = filt_size*2 - - conv3 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool2) - conv3 = Dropout(droprate)(conv3) - conv3 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv3) + filt_size = filt_size * 2 + + conv3 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool2) + conv3 = Dropout(droprate)(conv3) + conv3 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv3) pool3 = MaxPooling2D(pool_size=(2, 2))(conv3) - filt_size = filt_size*2 - - conv4 = Conv2D(filt_size, (3, 3), 
activation='relu', padding='same')(pool3) - conv4 = Dropout(droprate)(conv4) - conv4 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv4) + filt_size = filt_size * 2 + + conv4 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool3) + conv4 = Dropout(droprate)(conv4) + conv4 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv4) pool4 = MaxPooling2D(pool_size=(2, 2))(conv4) - filt_size = filt_size*2 - - conv5 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(pool4) + filt_size = filt_size * 2 + + conv5 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(pool4) conv5 = Dropout(droprate)(conv5) - conv5 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv5) - - filt_size = filt_size/2 - - up6 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv5), conv4], axis=3) - conv6 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up6) + conv5 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv5) + + filt_size = filt_size / 2 + + up6 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv5), + conv4, + ], + axis=3, + ) + conv6 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up6) conv6 = Dropout(droprate)(conv6) - conv6 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv6) - - filt_size = filt_size/2 - - up7 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv6), conv3], axis=3) - conv7 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up7) + conv6 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv6) + + filt_size = filt_size / 2 + + up7 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv6), + conv3, + ], + axis=3, + ) + conv7 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up7) conv7 = Dropout(droprate)(conv7) - conv7 = 
Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv7) - - filt_size = filt_size/2 - - up8 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv7), conv2], axis=3) - conv8 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up8) + conv7 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv7) + + filt_size = filt_size / 2 + + up8 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv7), + conv2, + ], + axis=3, + ) + conv8 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up8) conv8 = Dropout(droprate)(conv8) - conv8 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv8) - filt_size = filt_size/2 - - up9 = concatenate([Conv2DTranspose(filt_size, (2, 2), strides=(2, 2), padding='same')(conv8), conv1], axis=3) - conv9 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(up9) + conv8 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv8) + filt_size = filt_size / 2 + + up9 = concatenate( + [ + Conv2DTranspose( + filt_size, (2, 2), strides=(2, 2), padding="same")(conv8), + conv1, + ], + axis=3, + ) + conv9 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(up9) conv9 = Dropout(droprate)(conv9) - conv9 = Conv2D(filt_size, (3, 3), activation='relu', padding='same')(conv9) - - - conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9) - + conv9 = Conv2D(filt_size, (3, 3), activation="relu", + padding="same")(conv9) + + conv10 = Conv2D(1, (1, 1), activation="sigmoid")(conv9) + model = Model(inputs=[inputs], outputs=[conv10]) - - #model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef]) - #model.compile(optimizer=Nadam(lr=1e-3), loss=dice_coef_loss, metrics=[dice_coef]) - #model.compile(optimizer=Adadelta(), loss=dice_coef_loss, metrics=[dice_coef]) - + + # model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef]) + # 
model.compile(optimizer=Nadam(lr=1e-3), loss=dice_coef_loss, metrics=[dice_coef]) + # model.compile(optimizer=Adadelta(), loss=dice_coef_loss, metrics=[dice_coef]) + return model - - def save_model_to_json(model,model_json_fname): - - #model = unet.UResNet152(input_shape=(None, None, 3), classes=1,encoder_weights="imagenet11k") - #model = get_unet() - - #model.summary() + + def save_model_to_json(model, model_json_fname): + + # model = unet.UResNet152(input_shape=(None, None, 3), classes=1,encoder_weights="imagenet11k") + # model = get_unet() + + # model.summary() # serialize model to JSON model_json = model.to_json() with open(model_json_fname, "w") as json_file: - json_file.write(model_json) - - def preprocess_data(do_prediction,inputnpyfname,targetnpyfname,expandChannel,backbone): + json_file.write(model_json) + + def preprocess_data(do_prediction, inputnpyfname, targetnpyfname, + expandChannel, backbone): # Preprocess the data (beyond what I already did before) - - print('-'*30) - print('Loading and preprocessing data...') - print('-'*30) - + + print("-" * 30) + print("Loading and preprocessing data...") + print("-" * 30) + # Load, normalize, and cast the data - imgs_input = ( np.load(inputnpyfname).astype('float32') / (2**16-1) * (2**8-1) ).astype('uint8') - print('Input images information:') + imgs_input = (np.load(inputnpyfname).astype("float32") / (2**16 - 1) * + (2**8 - 1)).astype("uint8") + print("Input images information:") print(imgs_input.shape) print(imgs_input.dtype) - hist,bins = np.histogram(imgs_input) + hist, bins = np.histogram(imgs_input) print(hist) print(bins) if not do_prediction: - imgs_mask_train = np.load(targetnpyfname).astype('uint8') - print('Input masks information:') + imgs_mask_train = np.load(targetnpyfname).astype("uint8") + print("Input masks information:") print(imgs_mask_train.shape) print(imgs_mask_train.dtype) - hist,bins = np.histogram(imgs_mask_train) + hist, bins = np.histogram(imgs_mask_train) print(hist) print(bins) - + 
# Make the grayscale images RGB since that's what the model expects apparently - if expandChannel: - imgs_input = np.stack((imgs_input,)*3, -1) + if expandChannel: + imgs_input = np.stack((imgs_input,) * 3, -1) else: - imgs_input = np.expand_dims(imgs_input, 3) - print('New shape of input images:') + imgs_input = np.expand_dims(imgs_input, 3) + print("New shape of input images:") print(imgs_input.shape) if not do_prediction: - imgs_mask_train = np.expand_dims(imgs_mask_train, 3) - print('New shape of masks:') - print(imgs_mask_train.shape) - + imgs_mask_train = np.expand_dims(imgs_mask_train, 3) + print("New shape of masks:") + print(imgs_mask_train.shape) + # Preprocess as per https://github.com/qubvel/segmentation_models preprocessing_fn = get_preprocessing(backbone) imgs_input = preprocessing_fn(imgs_input) - + # Return appropriate variables if not do_prediction: - return(imgs_input,imgs_mask_train) + return (imgs_input, imgs_mask_train) else: - return(imgs_input) + return imgs_input # Import relevant modules and functions import sys - sys.path.append(gParameters['segmentation_models_repo']) - import numpy as np - from keras.models import Model - from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, Conv2DTranspose, Dropout - from keras.optimizers import Adam - from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping,CSVLogger - from keras.layers.normalization import BatchNormalization - from keras.backend import binary_crossentropy - import keras + + sys.path.append(gParameters["segmentation_models_repo"]) import random + + import keras + import numpy as np import tensorflow as tf - from keras.models import model_from_json + from keras.backend import binary_crossentropy + from keras.callbacks import ( + CSVLogger, + EarlyStopping, + ModelCheckpoint, + ReduceLROnPlateau, + ) + from keras.layers import ( + Conv2D, + Conv2DTranspose, + Dropout, + Input, + MaxPooling2D, + concatenate, + ) + from keras.layers.normalization import 
BatchNormalization + from keras.models import Model, model_from_json + from keras.optimizers import Adam from segmentation_models import Unet from segmentation_models.backbones import get_preprocessing - K.set_image_data_format('channels_last') # TF dimension ordering in this code - + + K.set_image_data_format( + "channels_last") # TF dimension ordering in this code + # Basically constants expandChannel = True - modelwtsfname = 'model_weights.h5' - model_json_fname = 'model.json' - csvfname = 'model.csv' - - do_prediction = gParameters['predict'] - if not do_prediction: # Train... - print('Training...') + modelwtsfname = "model_weights.h5" + model_json_fname = "model.json" + csvfname = "model.csv" + + do_prediction = gParameters["predict"] + if not do_prediction: # Train... + print("Training...") # Parameters - inputnpyfname = gParameters['images'] - labels = gParameters['labels'] - initialize = gParameters['initialize'] - backbone = gParameters['backbone'] - encoder = gParameters['encoder'] - lr = float(gParameters['lr']) - batch_size = gParameters['batch_size'] - obj_return = gParameters['obj_return'] - epochs = gParameters['epochs'] + inputnpyfname = gParameters["images"] + labels = gParameters["labels"] + initialize = gParameters["initialize"] + backbone = gParameters["backbone"] + encoder = gParameters["encoder"] + lr = float(gParameters["lr"]) + batch_size = gParameters["batch_size"] + obj_return = gParameters["obj_return"] + epochs = gParameters["epochs"] # Preprocess the data - imgs_train,imgs_mask_train = preprocess_data(do_prediction,inputnpyfname,labels,expandChannel,backbone) + imgs_train, imgs_mask_train = preprocess_data(do_prediction, + inputnpyfname, labels, + expandChannel, backbone) # Load, save, and compile the model model = Unet(backbone_name=backbone, encoder_weights=encoder) - save_model_to_json(model,model_json_fname) - model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', 
metrics=['binary_crossentropy','mean_squared_error',dice_coef, dice_coef_batch, focal_loss()]) + save_model_to_json(model, model_json_fname) + model.compile( + optimizer=Adam(lr=lr), + loss="binary_crossentropy", + metrics=[ + "binary_crossentropy", + "mean_squared_error", + dice_coef, + dice_coef_batch, + focal_loss(), + ], + ) # Load previous weights for restarting, if desired and possible if os.path.isfile(initialize): - print('-'*30) - print('Loading previous weights ...') + print("-" * 30) + print("Loading previous weights ...") model.load_weights(initialize) # Set up the training callback functions - model_checkpoint = ModelCheckpoint(modelwtsfname, monitor=obj_return, save_best_only=True) - reduce_lr = ReduceLROnPlateau(monitor=obj_return, factor=0.1,patience=100, min_lr=0.001,verbose=1) - model_es = EarlyStopping(monitor=obj_return, min_delta=0.00000001, patience=100, verbose=1, mode='auto') + model_checkpoint = ModelCheckpoint(modelwtsfname, + monitor=obj_return, + save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor=obj_return, + factor=0.1, + patience=100, + min_lr=0.001, + verbose=1) + model_es = EarlyStopping( + monitor=obj_return, + min_delta=0.00000001, + patience=100, + verbose=1, + mode="auto", + ) csv_logger = CSVLogger(csvfname, append=True) # Train the model - history_callback = model.fit(imgs_train, imgs_mask_train, batch_size=batch_size, epochs=epochs, verbose=2, shuffle=True, validation_split=0.10, callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger]) + history_callback = model.fit( + imgs_train, + imgs_mask_train, + batch_size=batch_size, + epochs=epochs, + verbose=2, + shuffle=True, + validation_split=0.10, + callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger], + ) print("Minimum validation loss:") print(min(history_callback.history[obj_return])) - else: # ...or predict - print('Inferring...') + else: # ...or predict + print("Inferring...") # Parameters - inputnpyfname = gParameters['images'] - initialize = 
gParameters['initialize'] - backbone = gParameters['backbone'] + inputnpyfname = gParameters["images"] + initialize = gParameters["initialize"] + backbone = gParameters["backbone"] # lr = float(gParameters['lr']) # this isn't needed but we're keeping it for the U-Net, where it is "needed" # Preprocess the data - imgs_infer = preprocess_data(do_prediction,inputnpyfname,'',expandChannel,backbone) + imgs_infer = preprocess_data(do_prediction, inputnpyfname, "", + expandChannel, backbone) # Load the model - #model = get_model(model_json_fname,initialize) - model = get_model(os.path.dirname(initialize)+'/'+model_json_fname,initialize) - + # model = get_model(model_json_fname,initialize) + model = get_model( + os.path.dirname(initialize) + "/" + model_json_fname, initialize) + # Run inference imgs_test_predict = model.predict(imgs_infer, batch_size=1, verbose=1) # Save the predicted masks - np.save('mask_predictions.npy', np.squeeze(np.round(imgs_test_predict).astype('uint8'))) + np.save( + "mask_predictions.npy", + np.squeeze(np.round(imgs_test_predict).astype("uint8")), + ) history_callback = None - + #### End model input ############################################################################################ - - return(history_callback) + + return history_callback + def main(): - print('Running main program...') + print("Running main program...") gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/models/unet.py b/archives/templates/models/unet.py index 0f609877..8dfbf697 100644 --- a/archives/templates/models/unet.py +++ b/archives/templates/models/unet.py @@ -1,12 +1,13 @@ # Import relevant modules -from keras import backend as K import numpy as np +from keras import backend as K # Parameters -candle_lib = '/data/BIDS-HPC/public/candle/Candle/common' +candle_lib = 
"/data/BIDS-HPC/public/candle/Candle/common" + def initialize_parameters(): - print('Initializing parameters...') + print("Initializing parameters...") import os @@ -15,20 +16,28 @@ def initialize_parameters(): # Import the CANDLE library import sys + sys.path.append(candle_lib) import candle_keras as candle # Instantiate the candle.Benchmark class - mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + mymodel_common = candle.Benchmark( + file_path, + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprog", + desc="My model", + ) # Get a dictionary of the model hyperparamters gParameters = candle.initialize_parameters(mymodel_common) # Return the dictionary of the hyperparameters - return(gParameters) + return gParameters + def run(gParameters): - print('Running model...') + print("Running model...") #### Begin model input ########################################################################################## # Currently based off run_unet.py @@ -40,10 +49,10 @@ def focal_loss(labels, logits, gamma=0, alpha=1.0): Notice: logits is probability after softmax gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x) - - Focal Loss for Dense Object Detection, + + Focal Loss for Dense Object Detection, https://doi.org/10.1016/j.ajodo.2005.02.022 - + :param labels: ground truth labels, shape of [batch_size] :param logits: model's output, shape of [batch_size, num_cls] :param gamma: @@ -53,87 +62,125 @@ def focal_loss(labels, logits, gamma=0, alpha=1.0): import tensorflow as tf - epsilon = 1.e-9 + epsilon = 1.0e-9 labels = tf.to_int64(labels) labels = tf.convert_to_tensor(labels, tf.int64) logits = tf.convert_to_tensor(logits, tf.float32) num_cls = logits.shape[1] - + model_out = tf.add(logits, epsilon) onehot_labels = tf.one_hot(labels, num_cls) ce = tf.multiply(onehot_labels, -tf.log(model_out)) - weight = tf.multiply(onehot_labels, 
tf.pow(tf.subtract(1., model_out), gamma)) + weight = tf.multiply(onehot_labels, + tf.pow(tf.subtract(1.0, model_out), gamma)) fl = tf.multiply(alpha, tf.multiply(weight, ce)) reduced_fl = tf.reduce_max(fl, axis=1) # reduced_fl = tf.reduce_sum(fl, axis=1) # same as reduce_max return reduced_fl def dice_coef(y_true, y_pred): - smooth = 1. - intersection = K.sum(y_true * y_pred, axis=[1,2,3]) - union = K.sum(y_true, axis=[1,2,3]) + K.sum(y_pred, axis=[1,2,3]) - dc = K.mean( (2. * intersection + smooth) / (union + smooth), axis=0) + smooth = 1.0 + intersection = K.sum(y_true * y_pred, axis=[1, 2, 3]) + union = K.sum(y_true, axis=[1, 2, 3]) + K.sum(y_pred, axis=[1, 2, 3]) + dc = K.mean((2.0 * intersection + smooth) / (union + smooth), axis=0) return dc def dice_coef_loss(y_true, y_pred): return -dice_coef(y_true, y_pred) - def get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate): - - print('-'*30) - print('Creating and compiling model...') - print('-'*30) - print (img_rows) - print (img_cols) + def get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ): + + print("-" * 30) + print("Creating and compiling model...") + print("-" * 30) + print(img_rows) + print(img_cols) inputs = Input((img_rows, img_cols, 1)) - conv_layers=[] - pool_layers=[inputs] - conv_filter=(conv_size, conv_size ) + conv_layers = [] + pool_layers = [inputs] + conv_filter = (conv_size, conv_size) for i in range(n_layers): - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(pool_layers[i]) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(pool_layers[i]) conv = BatchNormalization()(conv) if batch_norm else conv if dropout != None: conv = Dropout(dropout)(conv) - conv = Conv2D(filter_size, conv_filter, 
activation=activation_func, padding='same')(conv) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(conv) conv = BatchNormalization()(conv) if batch_norm else conv pool = MaxPooling2D(pool_size=(2, 2))(conv) conv_layers.append(conv) pool_layers.append(pool) - filter_size *=2 - - filter_size /=4 + filter_size *= 2 + + filter_size /= 4 - for i in range(n_layers-1): + for i in range(n_layers - 1): filter_size = int(filter_size) - up = concatenate([Conv2DTranspose(filter_size, (2, 2), strides=(2, 2), padding='same')(conv_layers[-1]), conv_layers[n_layers-i-2]], axis=3) - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(up) + up = concatenate( + [ + Conv2DTranspose(filter_size, (2, 2), + strides=(2, 2), + padding="same")(conv_layers[-1]), + conv_layers[n_layers - i - 2], + ], + axis=3, + ) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(up) conv = BatchNormalization()(conv) if batch_norm else conv - if dropout != None: + if dropout != None: conv = Dropout(dropout)(conv) - conv = Conv2D(filter_size, conv_filter, activation=activation_func, padding='same')(conv) + conv = Conv2D(filter_size, + conv_filter, + activation=activation_func, + padding="same")(conv) conv = BatchNormalization()(conv) if batch_norm else conv conv_layers.append(conv) filter_size /= 2 - #For binary classification, last activation should be sigmoid. + # For binary classification, last activation should be sigmoid. 
# if loss_func == 'dice': # last_activation = 'sigmoid' # else: # print ("WARNING: last_activation set to None") # last_activation = None - last_conv = Conv2D(1, (1, 1), activation=last_activation)(conv_layers[-1]) + last_conv = Conv2D(1, (1, 1), + activation=last_activation)(conv_layers[-1]) conv_layers.append(last_conv) - + model = Model(inputs=[inputs], outputs=[last_conv]) - - if loss_func == 'dice': - model.compile(optimizer=Adam(lr=learning_rate), loss=dice_coef_loss, metrics=[dice_coef]) + + if loss_func == "dice": + model.compile( + optimizer=Adam(lr=learning_rate), + loss=dice_coef_loss, + metrics=[dice_coef], + ) else: - #Any Keras loss function will be passed - model.compile(optimizer=Adam(lr=learning_rate), loss = loss_func) + # Any Keras loss function will be passed + model.compile(optimizer=Adam(lr=learning_rate), loss=loss_func) model.summary() model_json = model.to_json() with open("model.json", "w") as json_file: @@ -142,220 +189,328 @@ def get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func def get_images(images, masks, normalize_mask=False): - print('-'*30) - print('Loading and preprocessing train data...') - print('-'*30) + print("-" * 30) + print("Loading and preprocessing train data...") + print("-" * 30) - imgs_train = preprocess_images(images) - imgs_mask_train = preprocess_masks(masks, normalize_mask) + imgs_train = preprocess_images(images) + imgs_mask_train = preprocess_masks(masks, normalize_mask) - #Shuffle the images + # Shuffle the images np.random.seed(10) shuffled_id = np.random.permutation(imgs_train.shape[0]) imgs_train = imgs_train[shuffled_id] imgs_mask_train = imgs_mask_train[shuffled_id] - assert(np.amax(imgs_mask_train) <= 1) - assert(np.amin(imgs_mask_train) >= 0) - return_images = imgs_train - return_masks = imgs_mask_train + assert np.amax(imgs_mask_train) <= 1 + assert np.amin(imgs_mask_train) >= 0 + return_images = imgs_train + return_masks = imgs_mask_train - print (np.shape(return_images)) - 
print (np.shape(return_masks)) + print(np.shape(return_images)) + print(np.shape(return_masks)) return [return_images, return_masks] - def evaluate_params(images, labels, batch_size, epochs, obj_return, initialize, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate): - - images , masks = get_images(images,labels) - - print("Training images histogram") + def evaluate_params( + images, + labels, + batch_size, + epochs, + obj_return, + initialize, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ): + + images, masks = get_images(images, labels) + + print("Training images histogram") hist, bin_edges = np.histogram(images) print(hist) print(bin_edges) - - print("Training masks histogram") + + print("Training masks histogram") hist, bin_edges = np.histogram(masks) print(hist) print(bin_edges) - - #Get the images size + + # Get the images size img_rows = np.shape(images)[1] img_cols = np.shape(images)[2] - - model = get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) - - history_callback = train(model, images, masks, batch_size, epochs, obj_return, initialize=initialize) - return history_callback # note that history_callback is what's returned by model.fit() + + model = get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) + + history_callback = train(model, + images, + masks, + batch_size, + epochs, + obj_return, + initialize=initialize) + return history_callback # note that history_callback is what's returned by model.fit() def preprocess_images(images): imgs_train = np.squeeze(np.load(images)) if imgs_train.ndim != 3: - raise Exception("Error: The number of dimensions for images should equal 3, after squeezing the 
shape is:{0}".format(np.shape(images))) - imgs_train = imgs_train.astype('float32') + raise Exception( + "Error: The number of dimensions for images should equal 3, after squeezing the shape is:{0}" + .format(np.shape(images))) + imgs_train = imgs_train.astype("float32") print("MAX before:{0}".format(np.amax(imgs_train))) - #Normalize all number between 0 and 1. - uint16_info = np.iinfo('uint16') + # Normalize all number between 0 and 1. + uint16_info = np.iinfo("uint16") imgs_train = imgs_train / uint16_info.max print("MAX after:{0}".format(np.amax(imgs_train))) - imgs_train = np.expand_dims(imgs_train, axis= 3) + imgs_train = np.expand_dims(imgs_train, axis=3) return imgs_train def preprocess_masks(masks, normalize_mask=False): imgs_mask_train = np.squeeze(np.load(masks)) if imgs_mask_train.ndim != 3: - raise Exception("Error: The number of dimensions for masks should equal 3, after squeezing the shape is:{0}".format(np.shape(masks))) - imgs_mask_train = imgs_mask_train.astype('float32') + raise Exception( + "Error: The number of dimensions for masks should equal 3, after squeezing the shape is:{0}" + .format(np.shape(masks))) + imgs_mask_train = imgs_mask_train.astype("float32") if normalize_mask: - imgs_mask_train /= 255. 
# scale masks to [0, 1] - imgs_mask_train = np.expand_dims(imgs_mask_train, axis= 3) + imgs_mask_train /= 255.0 # scale masks to [0, 1] + imgs_mask_train = np.expand_dims(imgs_mask_train, axis=3) return imgs_mask_train - def train(model, imgs_train, imgs_mask_train, batch_size, epochs, obj_return, initialize=None): - - model_checkpoint = ModelCheckpoint(modelwtsfname, monitor=obj_return, save_best_only=True) - reduce_lr = ReduceLROnPlateau(monitor=obj_return, factor=0.1,patience=100, verbose=1) - model_es = EarlyStopping(monitor=obj_return, min_delta=0.000001, patience=400, verbose=1, mode='auto') - csv_logger = CSVLogger('training.csv') - - print('-'*30) - print('Fitting model...') - print('-'*30) - + def train( + model, + imgs_train, + imgs_mask_train, + batch_size, + epochs, + obj_return, + initialize=None, + ): + + model_checkpoint = ModelCheckpoint(modelwtsfname, + monitor=obj_return, + save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor=obj_return, + factor=0.1, + patience=100, + verbose=1) + model_es = EarlyStopping(monitor=obj_return, + min_delta=0.000001, + patience=400, + verbose=1, + mode="auto") + csv_logger = CSVLogger("training.csv") + + print("-" * 30) + print("Fitting model...") + print("-" * 30) + if initialize != None: print("Initializing the model using:{0}\n", initialize) model.load_weights(initialize) - - #test_call=TestCallback((imgs_train,imgs_mask_train)) - + + # test_call=TestCallback((imgs_train,imgs_mask_train)) + print(np.shape(imgs_train)) print(np.shape(imgs_mask_train)) - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=3000, verbose=2, shuffle=True, - return model.fit(imgs_train, imgs_mask_train, batch_size=batch_size, epochs=epochs, verbose=2, shuffle=True, - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=1500, verbose=2, shuffle=True, - #return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=4, verbose=2, shuffle=True, - validation_split=0.10, callbacks=[model_checkpoint, 
reduce_lr, model_es, csv_logger]) + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=3000, verbose=2, shuffle=True, + return model.fit( + imgs_train, + imgs_mask_train, + batch_size=batch_size, + epochs=epochs, + verbose=2, + shuffle=True, + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=1500, verbose=2, shuffle=True, + # return model.fit(imgs_train, imgs_mask_train, batch_size=2, epochs=4, verbose=2, shuffle=True, + validation_split=0.10, + callbacks=[model_checkpoint, reduce_lr, model_es, csv_logger], + ) def predict(model, weights, images): - print('-'*30) - print('Loading and preprocessing test data...') - print('-'*30) - - #imgs_test = np.load('./data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize.npy') - #imgs_mask_test = np.load('.//data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize_Mask.npy') - #imgs_test = imgs_test.astype('float32') - - #imgs_train = np.load('../data_python/1CDT_Green_Red_Annotated_FISH_Dilation8Conn1Iter_Training_128by128.npy') - #imgs_train = imgs_train.astype('float32') - #mean = np.mean(imgs_train) # mean for data centering - #std = np.std(imgs_train) # std for data normalization - #del imgs_train - #imgs_test -= mean - #imgs_test /= std - - print('-'*30) - print('Loading saved weights...') - print('-'*30) + print("-" * 30) + print("Loading and preprocessing test data...") + print("-" * 30) + + # imgs_test = np.load('./data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize.npy') + # imgs_mask_test = np.load('.//data_python/1CDT_Green_Red_FarRed_Annotated_FISH_Dilation4Conn1Iter_Testing_128by128_normalize_Mask.npy') + # imgs_test = imgs_test.astype('float32') + + # imgs_train = np.load('../data_python/1CDT_Green_Red_Annotated_FISH_Dilation8Conn1Iter_Training_128by128.npy') + # imgs_train = imgs_train.astype('float32') + # mean = np.mean(imgs_train) # mean for data 
centering + # std = np.std(imgs_train) # std for data normalization + # del imgs_train + # imgs_test -= mean + # imgs_test /= std + + print("-" * 30) + print("Loading saved weights...") + print("-" * 30) model.load_weights(weights) - print('-'*30) - print('Predicting masks on test data...') - print('-'*30) - #imgs_test = np.expand_dims(imgs_test,3) - - print ('{0}'.format(np.shape(images))) - print ('{0}'.format(type(images))) + print("-" * 30) + print("Predicting masks on test data...") + print("-" * 30) + # imgs_test = np.expand_dims(imgs_test,3) + print("{0}".format(np.shape(images))) + print("{0}".format(type(images))) - print("Inference images histogram") + print("Inference images histogram") hist, bin_edges = np.histogram(images) print(hist) print(bin_edges) - imgs_mask_test = model.predict(images, batch_size = 1,verbose=1) + imgs_mask_test = model.predict(images, batch_size=1, verbose=1) - print("Inference predictions histogram") + print("Inference predictions histogram") hist, bin_edges = np.histogram(imgs_mask_test) print(hist) print(bin_edges) - - #np.save('mask_predictions.npy', np.squeeze(imgs_mask_test)) - np.save('mask_predictions.npy', np.squeeze(np.round(imgs_mask_test).astype('uint8'))) + + # np.save('mask_predictions.npy', np.squeeze(imgs_mask_test)) + np.save("mask_predictions.npy", + np.squeeze(np.round(imgs_mask_test).astype("uint8"))) # Import relevant modules and functions + import pickle + + from keras.callbacks import ( + Callback, + CSVLogger, + EarlyStopping, + ModelCheckpoint, + ReduceLROnPlateau, + ) + from keras.layers import ( + BatchNormalization, + Conv2D, + Conv2DTranspose, + Dropout, + Input, + MaxPooling2D, + concatenate, + ) from keras.models import Model - from keras.layers import Input, concatenate, Conv2D, MaxPooling2D, Conv2DTranspose, Dropout, BatchNormalization from keras.optimizers import Adam - from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping, CSVLogger, Callback - import pickle # Basically a 
constant - modelwtsfname = 'model_weights.h5' + modelwtsfname = "model_weights.h5" - if not gParameters['predict']: - print('Training...') + if not gParameters["predict"]: + print("Training...") # Parameters - n_layers = gParameters['nlayers'] - filter_size = gParameters['num_filters'] - dropout = gParameters['dropout'] - activation_func = gParameters['activation'] - conv_size = gParameters['conv_size'] - loss_func = gParameters['loss_func'] - last_activation = gParameters['last_act'] - batch_norm = gParameters['batch_norm'] - learning_rate = float(gParameters['lr']) - images = gParameters['images'] - labels = gParameters['labels'] - batch_size = gParameters['batch_size'] - epochs = gParameters['epochs'] - obj_return = gParameters['obj_return'] - initialize = gParameters['initialize'] - - history_callback = evaluate_params(images, labels, batch_size, epochs, obj_return, initialize, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) # note that history_callback is what's returned by model.fit() + n_layers = gParameters["nlayers"] + filter_size = gParameters["num_filters"] + dropout = gParameters["dropout"] + activation_func = gParameters["activation"] + conv_size = gParameters["conv_size"] + loss_func = gParameters["loss_func"] + last_activation = gParameters["last_act"] + batch_norm = gParameters["batch_norm"] + learning_rate = float(gParameters["lr"]) + images = gParameters["images"] + labels = gParameters["labels"] + batch_size = gParameters["batch_size"] + epochs = gParameters["epochs"] + obj_return = gParameters["obj_return"] + initialize = gParameters["initialize"] + + history_callback = evaluate_params( + images, + labels, + batch_size, + epochs, + obj_return, + initialize, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) # note that history_callback is what's returned by model.fit() print("Minimum validation 
loss:") print(min(history_callback.history[obj_return])) - #Save the history as pickle object - pickle.dump(history_callback.history, open( "fit_history.p", "wb" ) ) + # Save the history as pickle object + pickle.dump(history_callback.history, open("fit_history.p", "wb")) else: - print('Inferring...') + print("Inferring...") # Parameters - n_layers = gParameters['nlayers'] - filter_size = gParameters['num_filters'] - dropout = gParameters['dropout'] - activation_func = gParameters['activation'] - conv_size = gParameters['conv_size'] - loss_func = gParameters['loss_func'] - last_activation = gParameters['last_act'] - batch_norm = gParameters['batch_norm'] - learning_rate = float(gParameters['lr']) - images = gParameters['images'] - initialize = gParameters['initialize'] - - #It is not necessary to pass masks for prediction, but I am just following the function - #prototype for now. + n_layers = gParameters["nlayers"] + filter_size = gParameters["num_filters"] + dropout = gParameters["dropout"] + activation_func = gParameters["activation"] + conv_size = gParameters["conv_size"] + loss_func = gParameters["loss_func"] + last_activation = gParameters["last_act"] + batch_norm = gParameters["batch_norm"] + learning_rate = float(gParameters["lr"]) + images = gParameters["images"] + initialize = gParameters["initialize"] + + # It is not necessary to pass masks for prediction, but I am just following the function + # prototype for now. 
images = preprocess_images(images) - #Get the images size + # Get the images size img_rows = np.shape(images)[1] img_cols = np.shape(images)[2] - model = get_unet(img_rows, img_cols, n_layers, filter_size, dropout, activation_func, conv_size, loss_func, last_activation, batch_norm, learning_rate) + model = get_unet( + img_rows, + img_cols, + n_layers, + filter_size, + dropout, + activation_func, + conv_size, + loss_func, + last_activation, + batch_norm, + learning_rate, + ) weights = initialize predict(model, weights, images) history_callback = None - + #### End model input ############################################################################################ - - return(history_callback) + + return history_callback + def main(): - print('Running main program...') + print("Running main program...") gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/models/uno.py b/archives/templates/models/uno.py index 8a6ae339..a8c39555 100644 --- a/archives/templates/models/uno.py +++ b/archives/templates/models/uno.py @@ -1,10 +1,10 @@ #! 
/usr/bin/env python -#Note this file (model.py) is the same as that in Benchmarks/Pilot1/Uno/uno_baseline_keras2.py except with the following change:: +# Note this file (model.py) is the same as that in Benchmarks/Pilot1/Uno/uno_baseline_keras2.py except with the following change:: # -#- unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', -#+ #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') -#+ unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', +# - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, 'uno_default_model.txt', 'keras', +# + #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') +# + unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', from __future__ import division, print_function @@ -15,46 +15,50 @@ import random import threading +import keras + +# For non-interactive plotting +import matplotlib as mpl import numpy as np import pandas as pd - -import keras from keras import backend as K from keras import optimizers +from keras.callbacks import ( + Callback, + LearningRateScheduler, + ModelCheckpoint, + ReduceLROnPlateau, + TensorBoard, +) +from keras.layers import Dense, Dropout, Input from keras.models import Model -from keras.layers import Input, Dense, Dropout -from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard from keras.utils import get_custom_objects from keras.utils.vis_utils import plot_model -from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error -from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold from scipy.stats.stats import pearsonr +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold -# For 
non-interactive plotting -import matplotlib as mpl -mpl.use('Agg') +mpl.use("Agg") +import candle_keras as candle import matplotlib.pyplot as plt - import uno as benchmark -import candle_keras as candle - import uno_data -from uno_data import CombinedDataLoader, CombinedDataGenerator - +from uno_data import CombinedDataGenerator, CombinedDataLoader logger = logging.getLogger(__name__) -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" def set_seed(seed): - os.environ['PYTHONHASHSEED'] = '0' + os.environ["PYTHONHASHSEED"] = "0" np.random.seed(seed) random.seed(seed) - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": import tensorflow as tf + tf.set_random_seed(seed) # session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) # sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) @@ -63,7 +67,7 @@ def set_seed(seed): # Uncommit when running on an optimized tensorflow where NUM_INTER_THREADS and # NUM_INTRA_THREADS env vars are set. 
# session_conf = tf.ConfigProto(inter_op_parallelism_threads=int(os.environ['NUM_INTER_THREADS']), - # intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) + # intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) # sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) # K.set_session(sess) @@ -77,11 +81,13 @@ def verify_path(path): def set_up_logger(logfile, verbose): verify_path(logfile) fh = logging.FileHandler(logfile) - fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")) + fh.setFormatter( + logging.Formatter("[%(asctime)s %(process)d] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S")) fh.setLevel(logging.DEBUG) sh = logging.StreamHandler() - sh.setFormatter(logging.Formatter('')) + sh.setFormatter(logging.Formatter("")) sh.setLevel(logging.DEBUG if verbose else logging.INFO) for log in [logger, uno_data.logger]: @@ -91,37 +97,37 @@ def set_up_logger(logfile, verbose): def extension_from_parameters(args): - """Construct string for saving model with annotation of parameters""" - ext = '' - ext += '.A={}'.format(args.activation) - ext += '.B={}'.format(args.batch_size) - ext += '.E={}'.format(args.epochs) - ext += '.O={}'.format(args.optimizer) + """Construct string for saving model with annotation of parameters.""" + ext = "" + ext += ".A={}".format(args.activation) + ext += ".B={}".format(args.batch_size) + ext += ".E={}".format(args.epochs) + ext += ".O={}".format(args.optimizer) # ext += '.LEN={}'.format(args.maxlen) - ext += '.LR={}'.format(args.learning_rate) - ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) - ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + ext += ".LR={}".format(args.learning_rate) + ext += ".CF={}".format("".join([x[0] for x in sorted(args.cell_features)])) + ext += ".DF={}".format("".join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: - ext += 
'.FS={}'.format(args.feature_subsample) + ext += ".FS={}".format(args.feature_subsample) if args.drop > 0: - ext += '.DR={}'.format(args.drop) + ext += ".DR={}".format(args.drop) if args.warmup_lr: - ext += '.wu_lr' + ext += ".wu_lr" if args.reduce_lr: - ext += '.re_lr' + ext += ".re_lr" if args.residual: - ext += '.res' + ext += ".res" if args.use_landmark_genes: - ext += '.L1000' + ext += ".L1000" if args.no_gen: - ext += '.ng' + ext += ".ng" for i, n in enumerate(args.dense): if n > 0: - ext += '.D{}={}'.format(i+1, n) + ext += ".D{}={}".format(i + 1, n) if args.dense_feature_layers != args.dense: for i, n in enumerate(args.dense): if n > 0: - ext += '.FD{}={}'.format(i+1, n) + ext += ".FD{}={}".format(i + 1, n) return ext @@ -134,9 +140,9 @@ def discretize(y, bins=5): def r2(y_true, y_pred): - SS_res = K.sum(K.square(y_true - y_pred)) + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res/(SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) def mae(y_true, y_pred): @@ -148,56 +154,63 @@ def evaluate_prediction(y_true, y_pred): mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) corr, _ = pearsonr(y_true, y_pred) - return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + return {"mse": mse, "mae": mae, "r2": r2, "corr": corr} -def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'): +def log_evaluation(metric_outputs, description="Comparing y_true and y_pred:"): logger.info(description) for metric, value in metric_outputs.items(): - logger.info(' {}: {:.4f}'.format(metric, value)) + logger.info(" {}: {:.4f}".format(metric, value)) -def plot_history(out, history, metric='loss', title=None): - title = title or 'model {}'.format(metric) - val_metric = 'val_{}'.format(metric) +def plot_history(out, history, metric="loss", title=None): + title = title or "model {}".format(metric) + val_metric = "val_{}".format(metric) plt.figure(figsize=(8, 6)) - 
plt.plot(history.history[metric], marker='o') - plt.plot(history.history[val_metric], marker='d') + plt.plot(history.history[metric], marker="o") + plt.plot(history.history[val_metric], marker="d") plt.title(title) plt.ylabel(metric) - plt.xlabel('epoch') - plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center') - png = '{}.plot.{}.png'.format(out, metric) - plt.savefig(png, bbox_inches='tight') + plt.xlabel("epoch") + plt.legend(["train_{}".format(metric), "val_{}".format(metric)], + loc="upper center") + png = "{}.plot.{}.png".format(out, metric) + plt.savefig(png, bbox_inches="tight") class LoggingCallback(Callback): + def __init__(self, print_fcn=print): Callback.__init__(self) self.print_fcn = print_fcn def on_epoch_end(self, epoch, logs={}): - msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + msg = "[Epoch: %i] %s" % ( + epoch, + ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())), + ) self.print_fcn(msg) class PermanentDropout(Dropout): + def __init__(self, rate, **kwargs): super(PermanentDropout, self).__init__(rate, **kwargs) self.uses_learning_phase = False def call(self, x, mask=None): - if 0. 
< self.rate < 1.: + if 0.0 < self.rate < 1.0: noise_shape = self._get_noise_shape(x) x = K.dropout(x, self.rate, noise_shape) return x class ModelRecorder(Callback): + def __init__(self, save_all_models=False): Callback.__init__(self) self.save_all_models = save_all_models - get_custom_objects()['PermanentDropout'] = PermanentDropout + get_custom_objects()["PermanentDropout"] = PermanentDropout def on_train_begin(self, logs={}): self.val_losses = [] @@ -205,16 +218,22 @@ def on_train_begin(self, logs={}): self.best_model = None def on_epoch_end(self, epoch, logs={}): - val_loss = logs.get('val_loss') + val_loss = logs.get("val_loss") self.val_losses.append(val_loss) if val_loss < self.best_val_loss: self.best_model = keras.models.clone_model(self.model) self.best_val_loss = val_loss -def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], - activation='relu', residual=False, - dropout_rate=0, permanent_dropout=True): +def build_feature_model( + input_shape, + name="", + dense_layers=[1000, 1000], + activation="relu", + residual=False, + dropout_rate=0, + permanent_dropout=True, +): x_input = Input(shape=input_shape) h = x_input for i, layer in enumerate(dense_layers): @@ -238,13 +257,17 @@ def build_model(loader, args, permanent_dropout=True, silent=False): input_models = {} dropout_rate = args.drop for fea_type, shape in loader.feature_shapes.items(): - base_type = fea_type.split('.')[0] - if base_type in ['cell', 'drug']: - box = build_feature_model(input_shape=shape, name=fea_type, - dense_layers=args.dense_feature_layers, - dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) + base_type = fea_type.split(".")[0] + if base_type in ["cell", "drug"]: + box = build_feature_model( + input_shape=shape, + name=fea_type, + dense_layers=args.dense_feature_layers, + dropout_rate=dropout_rate, + permanent_dropout=permanent_dropout, + ) if not silent: - logger.debug('Feature encoding submodel for %s:', fea_type) + logger.debug("Feature encoding 
submodel for %s:", fea_type) box.summary(print_fn=logger.debug) input_models[fea_type] = box @@ -252,7 +275,7 @@ def build_model(loader, args, permanent_dropout=True, silent=False): encoded_inputs = [] for fea_name, fea_type in loader.input_features.items(): shape = loader.feature_shapes[fea_type] - fea_input = Input(shape, name='input.'+fea_name) + fea_input = Input(shape, name="input." + fea_name) inputs.append(fea_input) if fea_type in input_models: input_model = input_models[fea_type] @@ -285,18 +308,25 @@ def build_model(loader, args, permanent_dropout=True, silent=False): def initialize_parameters(): # Build benchmark object - #mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, os.getenv("DEFAULT_PARAMS_FILE"), 'keras', - prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') - + # mymodel_common = candle.Benchmark(file_path,os.getenv("DEFAULT_PARAMS_FILE"),'keras',prog='myprog',desc='My model') + unoBmk = benchmark.BenchmarkUno( + benchmark.file_path, + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="uno_baseline", + desc= + "Build neural network based models to predict tumor response to single and paired drugs.", + ) + # Initialize parameters gParameters = candle.initialize_parameters(unoBmk) - #benchmark.logger.info('Params: {}'.format(gParameters)) + # benchmark.logger.info('Params: {}'.format(gParameters)) return gParameters class Struct: + def __init__(self, **entries): self.__dict__.update(entries) @@ -307,79 +337,106 @@ def run(params): ext = extension_from_parameters(args) verify_path(args.save) prefix = args.save + ext - logfile = args.logfile if args.logfile else prefix+'.log' + logfile = args.logfile if args.logfile else prefix + ".log" set_up_logger(logfile, args.verbose) - logger.info('Params: {}'.format(params)) + logger.info("Params: {}".format(params)) loader 
= CombinedDataLoader(seed=args.rng_seed) - loader.load(cache=args.cache, - ncols=args.feature_subsample, - cell_features=args.cell_features, - drug_features=args.drug_features, - drug_median_response_min=args.drug_median_response_min, - drug_median_response_max=args.drug_median_response_max, - use_landmark_genes=args.use_landmark_genes, - use_filtered_genes=args.use_filtered_genes, - preprocess_rnaseq=args.preprocess_rnaseq, - single=args.single, - train_sources=args.train_sources, - test_sources=args.test_sources, - embed_feature_source=not args.no_feature_source, - encode_response_source=not args.no_response_source, - ) + loader.load( + cache=args.cache, + ncols=args.feature_subsample, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + ) val_split = args.validation_split train_split = 1 - val_split if args.export_data: fname = args.export_data - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug) - train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, dataframe=True, single=args.single) - x_val_list, y_val = val_gen.get_slice(size=val_gen.size, dataframe=True, single=args.single) + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + 
cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + ) + train_gen = CombinedDataGenerator(loader, + batch_size=args.batch_size, + shuffle=args.shuffle) + val_gen = CombinedDataGenerator(loader, + partition="val", + batch_size=args.batch_size, + shuffle=args.shuffle) + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, + dataframe=True, + single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, + dataframe=True, + single=args.single) df_train = pd.concat([y_train] + x_train_list, axis=1) df_val = pd.concat([y_val] + x_val_list, axis=1) df = pd.concat([df_train, df_val]).reset_index(drop=True) if args.growth_bins > 1: - df = uno_data.discretize(df, 'Growth', bins=args.growth_bins) - df.to_csv(fname, sep='\t', index=False, float_format="%.3g") + df = uno_data.discretize(df, "Growth", bins=args.growth_bins) + df.to_csv(fname, sep="\t", index=False, float_format="%.3g") return - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug) + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + ) model = build_model(loader, args) - logger.info('Combined model:') + logger.info("Combined model:") model.summary(print_fn=logger.info) # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) if args.cp: model_json = model.to_json() - with open(prefix+'.model.json', 'w') as f: + with open(prefix + ".model.json", "w") as f: print(model_json, file=f) def warmup_scheduler(epoch): - lr = args.learning_rate or base_lr * args.batch_size/100 + lr = args.learning_rate or base_lr * args.batch_size / 100 if epoch <= 5: - K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5) - logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + 
K.set_value(model.optimizer.lr, + (base_lr * (5 - epoch) + lr * epoch) / 5) + logger.debug("Epoch {}: lr={:.5g}".format( + epoch, K.get_value(model.optimizer.lr))) return K.get_value(model.optimizer.lr) df_pred_list = [] - cv_ext = '' + cv_ext = "" cv = args.cv if args.cv > 1 else 1 for fold in range(cv): if args.cv > 1: - logger.info('Cross validation fold {}/{}:'.format(fold+1, cv)) - cv_ext = '.cv{}'.format(fold+1) + logger.info("Cross validation fold {}/{}:".format(fold + 1, cv)) + cv_ext = ".cv{}".format(fold + 1) model = build_model(loader, args, silent=True) - optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + optimizer = optimizers.deserialize({ + "class_name": args.optimizer, + "config": {} + }) base_lr = args.base_lr or K.get_value(optimizer.lr) if args.learning_rate: K.set_value(optimizer.lr, args.learning_rate) @@ -390,17 +447,24 @@ def warmup_scheduler(epoch): params.update(candle.compute_trainable_params(model)) candle_monitor = candle.CandleRemoteMonitor(params=params) - timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + timeout_monitor = candle.TerminateOnTimeOut(params["timeout"]) - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + reduce_lr = ReduceLROnPlateau(monitor="val_loss", + factor=0.5, + patience=5, + min_lr=0.00001) warmup_lr = LearningRateScheduler(warmup_scheduler) - checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True) + checkpointer = ModelCheckpoint(prefix + cv_ext + ".weights.h5", + save_best_only=True, + save_weights_only=True) tensorboard = TensorBoard(log_dir="tb/tb{}{}".format(ext, cv_ext)) history_logger = LoggingCallback(logger.debug) model_recorder = ModelRecorder() # callbacks = [history_logger, model_recorder] - callbacks = [candle_monitor, timeout_monitor, history_logger, model_recorder] + callbacks = [ + candle_monitor, timeout_monitor, history_logger, model_recorder + ] if 
args.reduce_lr: callbacks.append(reduce_lr) if args.warmup_lr: @@ -410,41 +474,66 @@ def warmup_scheduler(epoch): if args.tb: callbacks.append(tensorboard) - train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle) + train_gen = CombinedDataGenerator(loader, + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle) + val_gen = CombinedDataGenerator( + loader, + partition="val", + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle, + ) df_val = val_gen.get_response(copy=True) - y_val = df_val['Growth'].values + y_val = df_val["Growth"].values y_shuf = np.random.permutation(y_val) - log_evaluation(evaluate_prediction(y_val, y_shuf), - description='Between random pairs in y_val:') + log_evaluation( + evaluate_prediction(y_val, y_shuf), + description="Between random pairs in y_val:", + ) if args.no_gen: - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) - x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) - history = model.fit(x_train_list, y_train, - batch_size=args.batch_size, - epochs=args.epochs, - callbacks=callbacks, - validation_data=(x_val_list, y_val)) + x_train_list, y_train = train_gen.get_slice(size=train_gen.size, + single=args.single) + x_val_list, y_val = val_gen.get_slice(size=val_gen.size, + single=args.single) + history = model.fit( + x_train_list, + y_train, + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + validation_data=(x_val_list, y_val), + ) else: - logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size) - logger.info('Steps per epoch: train = %d, val = %d',train_gen.steps, val_gen.steps) - history = model.fit_generator(train_gen.flow(single=args.single), train_gen.steps, - epochs=args.epochs, - callbacks=callbacks, - 
validation_data=val_gen.flow(single=args.single), - validation_steps=val_gen.steps) + logger.info( + "Data points per epoch: train = %d, val = %d", + train_gen.size, + val_gen.size, + ) + logger.info("Steps per epoch: train = %d, val = %d", + train_gen.steps, val_gen.steps) + history = model.fit_generator( + train_gen.flow(single=args.single), + train_gen.steps, + epochs=args.epochs, + callbacks=callbacks, + validation_data=val_gen.flow(single=args.single), + validation_steps=val_gen.steps, + ) if args.cp: - model.load_weights(prefix+cv_ext+'.weights.h5') + model.load_weights(prefix + cv_ext + ".weights.h5") # model = model_recorder.best_model if args.no_gen: y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) else: val_gen.reset() - y_val_pred = model.predict_generator(val_gen.flow(single=args.single), val_gen.steps) + y_val_pred = model.predict_generator( + val_gen.flow(single=args.single), val_gen.steps) y_val_pred = y_val_pred[:val_gen.size] y_val_pred = y_val_pred.flatten() @@ -452,39 +541,52 @@ def warmup_scheduler(epoch): scores = evaluate_prediction(y_val, y_val_pred) log_evaluation(scores) - df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val) + df_val = df_val.assign(PredictedGrowth=y_val_pred, + GrowthError=y_val_pred - y_val) df_pred_list.append(df_val) - plot_history(prefix, history, 'loss') - plot_history(prefix, history, 'r2') + plot_history(prefix, history, "loss") + plot_history(prefix, history, "r2") - pred_fname = prefix + '.predicted.tsv' + pred_fname = prefix + ".predicted.tsv" df_pred = pd.concat(df_pred_list) - df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) - df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + df_pred.sort_values( + ["Source", "Sample", "Drug1", "Drug2", "Dose1", "Dose2", "Growth"], + inplace=True) + df_pred.to_csv(pred_fname, sep="\t", index=False, float_format="%.4g") if args.cv > 1: - scores = 
evaluate_prediction(df_pred['Growth'], df_pred['PredictedGrowth']) - log_evaluation(scores, description='Combining cross validation folds:') + scores = evaluate_prediction(df_pred["Growth"], + df_pred["PredictedGrowth"]) + log_evaluation(scores, description="Combining cross validation folds:") for test_source in loader.test_sep_sources: - test_gen = CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + test_gen = CombinedDataGenerator(loader, + partition="test", + batch_size=args.batch_size, + source=test_source) df_test = test_gen.get_response(copy=True) - y_test = df_test['Growth'].values + y_test = df_test["Growth"].values n_test = len(y_test) if n_test == 0: continue if args.no_gen: - x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + x_test_list, y_test = test_gen.get_slice(size=test_gen.size, + single=args.single) y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) else: - y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) + y_test_pred = model.predict_generator( + test_gen.flow(single=args.single), test_gen.steps) y_test_pred = y_test_pred[:test_gen.size] y_test_pred = y_test_pred.flatten() scores = evaluate_prediction(y_test, y_test_pred) - log_evaluation(scores, description='Testing on data from {} ({})'.format(test_source, n_test)) + log_evaluation( + scores, + description="Testing on data from {} ({})".format( + test_source, n_test), + ) - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() logger.handlers = [] @@ -497,7 +599,7 @@ def main(): run(params) -if __name__ == '__main__': +if __name__ == "__main__": main() - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() diff --git a/archives/templates/models/wrapper_compliant/mnist_mlp.py b/archives/templates/models/wrapper_compliant/mnist_mlp.py index c059d1c4..06f11171 100644 --- 
a/archives/templates/models/wrapper_compliant/mnist_mlp.py +++ b/archives/templates/models/wrapper_compliant/mnist_mlp.py @@ -1,45 +1,46 @@ # Run the wrapper_connector script, which (1) appends $SUPP_PYTHONPATH to the Python environment if it's defined and (2) defines the function for loading the hyperparameters -import sys, os -sys.path.append(os.getenv("CANDLE")+'/Supervisor/templates/scripts') +import os +import sys + +sys.path.append(os.getenv("CANDLE") + "/Supervisor/templates/scripts") import wrapper_connector -gParameters = wrapper_connector.load_params('params.json') -################ ADD MODEL BELOW USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY ABOVE ####################################### +gParameters = wrapper_connector.load_params("params.json") +################ ADD MODEL BELOW USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY ABOVE ####################################### ########################################## # Your DL start here. See mnist_mlp.py # ########################################## -'''Trains a simple deep NN on the MNIST dataset. +# Trains a simple deep NN on the MNIST dataset. -Gets to 98.40% test accuracy after 20 epochs -(there is *a lot* of margin for parameter tuning). -2 seconds per epoch on a K520 GPU. -''' +# Gets to 98.40% test accuracy after 20 epochs +# (there is *a lot* of margin for parameter tuning). +# 2 seconds per epoch on a K520 GPU. 
import keras from keras.datasets import mnist -from keras.models import Sequential from keras.layers import Dense, Dropout +from keras.models import Sequential from keras.optimizers import RMSprop -batch_size = gParameters['batch_size'] +batch_size = gParameters["batch_size"] num_classes = 10 -epochs = gParameters['epochs'] +epochs = gParameters["epochs"] -activation = gParameters['activation'] -optimizer = gParameters['optimizer'] +activation = gParameters["activation"] +optimizer = gParameters["optimizer"] # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') +x_train = x_train.astype("float32") +x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') +print(x_train.shape[0], "train samples") +print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) @@ -50,36 +51,43 @@ model.add(Dropout(0.2)) model.add(Dense(512, activation=activation)) model.add(Dropout(0.2)) -model.add(Dense(num_classes, activation='softmax')) +model.add(Dense(num_classes, activation="softmax")) model.summary() -model.compile(loss='categorical_crossentropy', - optimizer=optimizer, - metrics=['accuracy']) +model.compile(loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["accuracy"]) -history = model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) +history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), +) score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) +print("Test loss:", score[0]) +print("Test 
accuracy:", score[1]) ########################################## # End of mnist_mlp.py #################### ########################################## - ################ ADD MODEL ABOVE USING gParameters DICTIONARY AS CURRENT HYPERPARAMETER SET; DO NOT MODIFY BELOW ####################################### # Ensure that above you DEFINE the history object (as in, e.g., the return value of model.fit()) or val_to_return (a single number) in your model; below we essentially RETURN those values -try: history +try: + history except NameError: - try: val_to_return + try: + val_to_return except NameError: - print("Error: Neither a history object nor a val_to_return variable was defined upon running the model on the current hyperparameter set; exiting") + print( + "Error: Neither a history object nor a val_to_return variable was defined upon running the model on the current hyperparameter set; exiting" + ) exit else: - wrapper_connector.write_history_from_value(val_to_return, 'val_to_return.json') + wrapper_connector.write_history_from_value(val_to_return, + "val_to_return.json") else: - wrapper_connector.write_history(history, 'val_to_return.json') \ No newline at end of file + wrapper_connector.write_history(history, "val_to_return.json") diff --git a/archives/templates/run_without_candle.sh b/archives/templates/run_without_candle.sh index f4d5f170..84e5f3cf 100755 --- a/archives/templates/run_without_candle.sh +++ b/archives/templates/run_without_candle.sh @@ -17,4 +17,4 @@ module load python/3.6 export DEFAULT_PARAMS_FILE="$CANDLE/Supervisor/templates/model_params/mnist1.txt" # Run the model -python $CANDLE/Supervisor/templates/models/mnist/mnist_mlp.py \ No newline at end of file +python $CANDLE/Supervisor/templates/models/mnist/mnist_mlp.py diff --git a/archives/templates/scripts/candle_compliant_wrapper.py b/archives/templates/scripts/candle_compliant_wrapper.py index 815eb833..d426deff 100644 --- a/archives/templates/scripts/candle_compliant_wrapper.py +++ 
b/archives/templates/scripts/candle_compliant_wrapper.py @@ -1,59 +1,91 @@ # This file should generally follow the standard CANDLE-compliance procedure + def initialize_parameters(): # Add the candle_keras library to the Python path - import sys, os - sys.path.append(os.getenv("CANDLE")+'/Candle/common') + import os + import sys + + sys.path.append(os.getenv("CANDLE") + "/Candle/common") # Instantiate the Benchmark class (the values of the prog and desc parameters don't really matter) import candle_keras as candle - mymodel_common = candle.Benchmark(os.path.dirname(os.path.realpath(__file__)), os.getenv("DEFAULT_PARAMS_FILE"), 'keras', prog='myprogram', desc='My CANDLE example') + + mymodel_common = candle.Benchmark( + os.path.dirname(os.path.realpath(__file__)), + os.getenv("DEFAULT_PARAMS_FILE"), + "keras", + prog="myprogram", + desc="My CANDLE example", + ) # Read the parameters (in a dictionary format) pointed to by the environment variable DEFAULT_PARAMS_FILE gParameters = candle.initialize_parameters(mymodel_common) # Return this dictionary of parameters - return(gParameters) + return gParameters + def run(gParameters): # Define the dummy history class; defining it here to keep this file aligned with the standard CANDLE-compliance procedure class HistoryDummy: + def __init__(self, mynum): - self.history = {'val_loss': [mynum], 'val_corr': [mynum], 'val_dice_coef': [mynum]} + self.history = { + "val_loss": [mynum], + "val_corr": [mynum], + "val_dice_coef": [mynum], + } # Reformat a value that doesn't have an analogous field in the JSON format - gParameters['datatype'] = str(gParameters['datatype']) + gParameters["datatype"] = str(gParameters["datatype"]) # Write the current set of hyperparameters to a JSON file import json - with open('params.json', 'w') as outfile: + + with open("params.json", "w") as outfile: json.dump(gParameters, outfile) # Run the wrapper script model_wrapper.sh where the environment is defined and the model (whether in Python or R) is 
called - myfile = open('subprocess_out_and_err.txt','w') - import subprocess, os - print('Starting run of model_wrapper.sh from candle_compliant_wrapper.py...') - subprocess.run(['bash', os.getenv("CANDLE")+'/Supervisor/templates/scripts/model_wrapper.sh'], stdout=myfile, stderr=subprocess.STDOUT) - print('Finished run of model_wrapper.sh from candle_compliant_wrapper.py') + myfile = open("subprocess_out_and_err.txt", "w") + import os + import subprocess + + print( + "Starting run of model_wrapper.sh from candle_compliant_wrapper.py...") + subprocess.run( + [ + "bash", + os.getenv("CANDLE") + + "/Supervisor/templates/scripts/model_wrapper.sh", + ], + stdout=myfile, + stderr=subprocess.STDOUT, + ) + print("Finished run of model_wrapper.sh from candle_compliant_wrapper.py") myfile.close() # Read in the history.history dictionary containing the result from the JSON file created by the model history = HistoryDummy(4444) import json - with open('val_to_return.json') as infile: + + with open("val_to_return.json") as infile: history.history = json.load(infile) - return(history) - + return history + + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: from keras import backend as K + K.clear_session() except AttributeError: - pass \ No newline at end of file + pass diff --git a/archives/templates/scripts/copy_candle_template b/archives/templates/scripts/copy_candle_template index 46137fdf..29046a1e 100755 --- a/archives/templates/scripts/copy_candle_template +++ b/archives/templates/scripts/copy_candle_template @@ -33,4 +33,4 @@ if [ "a$ret1" == "a0" ] && [ "a$ret2" == "a0" ]; then echo -e " (2) First modify $submission_script using https://cbiit.github.io/fnlcr-bids-hpc/documentation/candle/how_to_modify_the_candle_templates as a guide and then submit your own CANDLE job by running './$submission_script' (no 'sbatch' needed)\n" else echo -e "\nAn error occurred; see error message(s) 
above\n" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/copy_candle_template-new b/archives/templates/scripts/copy_candle_template-new index 79960adb..8effbdf0 100755 --- a/archives/templates/scripts/copy_candle_template-new +++ b/archives/templates/scripts/copy_candle_template-new @@ -33,4 +33,4 @@ if [ "a$ret1" == "a0" ] && [ "a$ret2" == "a0" ]; then echo -e " (2) First modify $submission_script using https://cbiit.github.io/fnlcr-bids-hpc/documentation/candle/how_to_modify_the_candle_templates as a guide and then submit your own CANDLE job by running './$submission_script' (no 'sbatch' needed)\n" else echo -e "\nAn error occurred; see error message(s) above\n" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/model_wrapper.sh b/archives/templates/scripts/model_wrapper.sh index e31e0300..78903174 100644 --- a/archives/templates/scripts/model_wrapper.sh +++ b/archives/templates/scripts/model_wrapper.sh @@ -43,4 +43,4 @@ elif [ "x$suffix" == "xr" ]; then fi # Display timing information -echo "MODEL_WRAPPER.SH END TIME: $(date +%s)" \ No newline at end of file +echo "MODEL_WRAPPER.SH END TIME: $(date +%s)" diff --git a/archives/templates/scripts/restart.py b/archives/templates/scripts/restart.py index 9a5b89c3..7b791fa9 100644 --- a/archives/templates/scripts/restart.py +++ b/archives/templates/scripts/restart.py @@ -1,9 +1,9 @@ -import os import datetime -import pandas as pd -import numpy as np import json +import os +import numpy as np +import pandas as pd result_file = "result.txt" params_log = "params.json" @@ -11,55 +11,59 @@ objective_str = "objective" eval_dir = "eval_dir" config_json = "configuration.json" -TIME_FORMAT='%Y-%m-%d %H:%M:%S' +TIME_FORMAT = "%Y-%m-%d %H:%M:%S" start = "start_time" stop = "stop_time" -eval_key = 'id' +eval_key = "id" exp_dir = "EXPERIMENTS" upf_space = "WORKFLOW_SETTINGS_FILE" - + def grep(model_log): """ Parse the log file to generate the start and stop times - Arguments: + 
Arguments: model_log: filepath The log file for the evaluation returns: dict Dictionary with start and stop times. - + """ import subprocess global TIME_FORMAT global start - global stop - - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + global stop + + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") result = {} - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result[start] = dt else: result[stop] = dt - + return result + def get_immediate_subdirectories(a_dir): - return [name for name in os.listdir(a_dir) - if os.path.isdir(os.path.join(a_dir, name))] + return [ + name for name in os.listdir(a_dir) + if os.path.isdir(os.path.join(a_dir, name)) + ] def get_successful_evaluations(all_eval): """ Returns a data frame with the evaluations that run successfully only Arguments - all_eval: dataframe + all_eval: dataframe Dataframe that includes all evaluations Returns: @@ -67,49 +71,54 @@ def get_successful_evaluations(all_eval): """ global objective_str - #For now return all evaluations that a result value. + # For now return all evaluations that a result value. u = ~all_eval[objective_str].isnull() - return all_eval[u] + return all_eval[u] + def get_remaining_evaluations(upf_file, all_eval): - """ - Generate a upf file with that contains all the evaluations that did not - complete successuflly - + """Generate a upf file with that contains all the evaluations that did not + complete successuflly. 
+ Arguments: - upf_file: filename + upf_file: filename The orignial file that contains the parameter space all_eval: dataframe The dataframe that has attemped simulation parameters - + Return: str A str that contains information the upf info for the configuration that did not complete """ - #Read and parse the originla upf + # Read and parse the originla upf global eval_key if os.path.exists(upf_file): - with open(upf_file, 'r') as upf: - upf_str = upf.read() - else: + with open(upf_file, "r") as upf: + upf_str = upf.read() + else: raise Exception("The upf file {} does not exist".format(upf_file)) - #parse the upf string to a list of dictionaries - lines = upf_str.split('\n') - lines = [l for l in lines if l.strip() != ''] + # parse the upf string to a list of dictionaries + lines = upf_str.split("\n") + lines = [l for l in lines if l.strip() != ""] params = [] for configuration in lines: params.append(eval(configuration)) - + total_ids = set([x[eval_key] for x in params]) - success_eval_df = get_successful_evaluations(all_eval) + success_eval_df = get_successful_evaluations(all_eval) success_ids = set(success_eval_df[eval_key].tolist()) remaining_ids = total_ids.difference(success_ids) - new_upf = [json.dumps(config) for config in params if config[eval_key] in remaining_ids] + new_upf = [ + json.dumps(config) + for config in params + if config[eval_key] in remaining_ids + ] return "\n".join(new_upf) + def all_runs_log(exp_dir): """ Gather information about all the runs in an experiment @@ -117,24 +126,25 @@ def all_runs_log(exp_dir): exp_dir: str Path to the experiment directory - Returns: Dataframe - Every evaluation will occupy a row + Returns: Dataframe + Every evaluation will occupy a row """ eval_list = [] - launch_dirs = get_immediate_subdirectories(exp_dir) - for launch in launch_dirs: - run_dir = os.path.join(exp_dir, launch, "run") - #print(run_dir) + launch_dirs = get_immediate_subdirectories(exp_dir) + for launch in launch_dirs: + run_dir = 
os.path.join(exp_dir, launch, "run") + # print(run_dir) eval_dirs = get_immediate_subdirectories(run_dir) for evaluation in eval_dirs: eval_dir = os.path.join(run_dir, evaluation) eval_dic = single_evaluation_log(eval_dir) - eval_list.append(pd.Series(eval_dic, index = eval_dic.keys())) + eval_list.append(pd.Series(eval_dic, index=eval_dic.keys())) df = pd.DataFrame(eval_list) return df + def single_evaluation_log(evaluation_dir): """ Checks if the an evaluation is successful and generate evaluation parameters @@ -146,38 +156,38 @@ def single_evaluation_log(evaluation_dir): Dictionary with all the parameters of the evaluation and the objective value """ - global result_file - global params_log - global eval_log - global objective_str + global result_file + global params_log + global eval_log + global objective_str global eval_dir - global config_json + global config_json eval_dic = {} - #See if evaluation completed successfully if resutls.txt contains a float + # See if evaluation completed successfully if resutls.txt contains a float result_path = os.path.join(evaluation_dir, result_file) if not os.path.exists(result_path): obj_value = np.nan else: - with open(result_path,mode='r') as result: + with open(result_path, mode="r") as result: obj_str = result.read() try: obj_value = float(obj_str) except Exception as e: - obj_value = np.nan + obj_value = np.nan eval_dic[objective_str] = obj_value - #Read the parameters dictionary + # Read the parameters dictionary params_path = os.path.join(evaluation_dir, params_log) if os.path.exists(params_path): - with open(params_path, 'r') as f: + with open(params_path, "r") as f: model_params = json.load(f) eval_dic.update(model_params) - #Read the timing metadata + # Read the timing metadata model_log = os.path.join(evaluation_dir, eval_log) if os.path.exists(model_log): timing_dic = grep(model_log) @@ -185,14 +195,17 @@ def single_evaluation_log(evaluation_dir): return eval_dic + if __name__ == "__main__": import argparse - 
parser = argparse.ArgumentParser(description = 'Restart a UPF experiment') - parser.add_argument('submit_args', help='The biowulf submission configuration') + parser = argparse.ArgumentParser(description="Restart a UPF experiment") + + parser.add_argument("submit_args", + help="The biowulf submission configuration") args = parser.parse_args() - with open(args.submit_args) as json_file: + with open(args.submit_args) as json_file: config_json = json.load(json_file) experiment = config_json[exp_dir] @@ -201,4 +214,4 @@ def single_evaluation_log(evaluation_dir): status = all_runs_log(experiment) new_upf = get_remaining_evaluations(upf_file, status) if new_upf != "": - print(new_upf) \ No newline at end of file + print(new_upf) diff --git a/archives/templates/scripts/run_without_candle.sh b/archives/templates/scripts/run_without_candle.sh index 4b026f81..c79adc26 100755 --- a/archives/templates/scripts/run_without_candle.sh +++ b/archives/templates/scripts/run_without_candle.sh @@ -12,4 +12,4 @@ #SBATCH --job-name=mnist_test_no_candle export USE_CANDLE=0 -./submit_candle_job.sh \ No newline at end of file +./submit_candle_job.sh diff --git a/archives/templates/scripts/run_workflows.sh b/archives/templates/scripts/run_workflows.sh index c040d4d0..014cf29f 100755 --- a/archives/templates/scripts/run_workflows.sh +++ b/archives/templates/scripts/run_workflows.sh @@ -109,4 +109,4 @@ if [ "${USE_CANDLE:-1}" -eq 1 ]; then # ...otherwise, run the wrapper alone, outside of CANDLE else python "$MODEL_PYTHON_DIR/$MODEL_PYTHON_SCRIPT.py" -fi \ No newline at end of file +fi diff --git a/archives/templates/scripts/submit_candle_job.sh b/archives/templates/scripts/submit_candle_job.sh index 80a6a444..566db7aa 100755 --- a/archives/templates/scripts/submit_candle_job.sh +++ b/archives/templates/scripts/submit_candle_job.sh @@ -32,4 +32,4 @@ export USE_CANDLE=1 # if not already set, as in e.g. 
by run_without_candle.sh, s ################ MODIFY ONLY ABOVE; DO NOT MODIFY BELOW #################################################################### -$CANDLE/Supervisor/templates/scripts/run_workflows.sh \ No newline at end of file +$CANDLE/Supervisor/templates/scripts/run_workflows.sh diff --git a/archives/templates/scripts/wrapper_connector.py b/archives/templates/scripts/wrapper_connector.py index e1e3ca92..eb44d81e 100644 --- a/archives/templates/scripts/wrapper_connector.py +++ b/archives/templates/scripts/wrapper_connector.py @@ -1,22 +1,36 @@ # If it's defined in the environment, append $SUPP_PYTHONPATH to the Python path -import os, json -supp_pythonpath = os.getenv('SUPP_PYTHONPATH') +import json +import os + +supp_pythonpath = os.getenv("SUPP_PYTHONPATH") if supp_pythonpath is not None: import sys + sys.path.append(supp_pythonpath) # Load the hyperparameter dictionary stored in the JSON file params.json def load_params(params_json_file): with open(params_json_file) as infile: - return(json.load(infile)) + return json.load(infile) + # Write the history.history dictionary to a JSON file def write_history(history, val_to_return_json_file): - with open(val_to_return_json_file, 'w') as outfile: + with open(val_to_return_json_file, "w") as outfile: json.dump(history.history, outfile) + # Make a history.history dictionary from a return value and write it to a JSON file -def write_history_from_value(val_to_return, val_to_return_json_file): # val_to_return_json_file should be val_to_return.json to match the value in candle_compliant_wrapper.py - with open(val_to_return_json_file, 'w') as outfile: - json.dump({'val_loss': [val_to_return], 'val_corr': [val_to_return], 'val_dice_coef': [val_to_return]}, outfile) \ No newline at end of file +def write_history_from_value( + val_to_return, val_to_return_json_file +): # val_to_return_json_file should be val_to_return.json to match the value in candle_compliant_wrapper.py + with open(val_to_return_json_file, "w") as 
outfile: + json.dump( + { + "val_loss": [val_to_return], + "val_corr": [val_to_return], + "val_dice_coef": [val_to_return], + }, + outfile, + ) diff --git a/archives/templates/workflow_settings/mlrmbo1.sh b/archives/templates/workflow_settings/mlrmbo1.sh index fba6629f..851a4f29 100644 --- a/archives/templates/workflow_settings/mlrmbo1.sh +++ b/archives/templates/workflow_settings/mlrmbo1.sh @@ -5,4 +5,4 @@ MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRENT_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} DESIGN_SIZE=${DESIGN_SIZE:-9} -PARAM_SET_FILE=${PARAM_SET_FILE:-$CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/data/nt3_hps_exp_01.R} \ No newline at end of file +PARAM_SET_FILE=${PARAM_SET_FILE:-$CANDLE/Supervisor/workflows/$WORKFLOW_TYPE/data/nt3_hps_exp_01.R} diff --git a/archives/templates/workflow_settings/upf-default.txt b/archives/templates/workflow_settings/upf-default.txt index 2e78b091..c70294d7 100644 --- a/archives/templates/workflow_settings/upf-default.txt +++ b/archives/templates/workflow_settings/upf-default.txt @@ -1 +1 @@ -{"id": "default_run"} \ No newline at end of file +{"id": "default_run"} diff --git a/archives/templates/workflow_settings/upf1.txt b/archives/templates/workflow_settings/upf1.txt index 3607213d..cabc1dc0 100644 --- a/archives/templates/workflow_settings/upf1.txt +++ b/archives/templates/workflow_settings/upf1.txt @@ -1 +1 @@ -{"id": "mytest", "batch_size": 2048, "learning_rate": 0.00001} \ No newline at end of file +{"id": "mytest", "batch_size": 2048, "learning_rate": 0.00001} diff --git a/archives/templates/workflow_settings/upf3.txt b/archives/templates/workflow_settings/upf3.txt index aafae5db..76b9a800 100644 --- a/archives/templates/workflow_settings/upf3.txt +++ b/archives/templates/workflow_settings/upf3.txt @@ -4,4 +4,4 @@ {"id": "hpset_04", "epochs": 30, "activation": "relu"} {"id": "hpset_05", "epochs": 10, "batch_size": 128} {"id": "hpset_06", "epochs": 10, "batch_size": 256} -{"id": 
"hpset_07", "epochs": 10, "batch_size": 512} \ No newline at end of file +{"id": "hpset_07", "epochs": 10, "batch_size": 512} diff --git a/archives/workflows/auen41_ff/auen41_ff.py b/archives/workflows/auen41_ff/auen41_ff.py index 0236dbe5..b757ad36 100644 --- a/archives/workflows/auen41_ff/auen41_ff.py +++ b/archives/workflows/auen41_ff/auen41_ff.py @@ -1,27 +1,27 @@ -import pandas as pd -import numpy as np - -from keras.layers import Input, Dense, Dropout -from keras.models import Model - -import time import json +import time import matplotlib as mpl +import numpy as np +import pandas as pd +from keras.layers import Dense, Dropout, Input +from keras.models import Model + mpl.use('Agg') import matplotlib.pyplot as plt EPOCH = 10 BATCH = 50 -P = 60025 # 245 x 245 -N1 = 2000 -NE = 600 # encoded dim +P = 60025 # 245 x 245 +N1 = 2000 +NE = 600 # encoded dim F_MAX = 33.3 -DR = 0.2 +DR = 0.2 class AutoEncoder(): + def __init__(self, trainFileName, testFileName, metaDataDict): self.train = None self.test = None @@ -65,7 +65,8 @@ def createEncoder(self): end = time.time() encoded_input = Input(shape=(NE,)) self.encoder = Model(input_vector, encoded) - self.decoder = Model(encoded_input, + self.decoder = Model( + encoded_input, self.ae.layers[-1](self.ae.layers[-2](encoded_input))) self.ae.compile(optimizer='rmsprop', loss='mean_squared_error') self.initTime = end - start @@ -74,8 +75,11 @@ def createEncoder(self): def trainEncoder(self): start = time.time() - self.ae.fit(self.x_train, self.x_train, batch_size=BATCH, - nb_epoch=EPOCH, validation_data=[self.x_test, self.x_test]) + self.ae.fit(self.x_train, + self.x_train, + batch_size=BATCH, + nb_epoch=EPOCH, + validation_data=[self.x_test, self.x_test]) end = time.time() self.trainTime = end - start @@ -106,17 +110,19 @@ def plotResults(self): plt.title("Histogram of Errors with 'auto' bins") plt.savefig('histogram.png') + def saveJsonResult(jsonResult, jsonFilename): f = open(jsonFilename, 'w') f.write('[\n') for i, val 
in enumerate(jsonResult): - if i < len(jsonResult)-1: - f.write('\t'+val+',\n') + if i < len(jsonResult) - 1: + f.write('\t' + val + ',\n') else: - f.write('\t'+val+'\n') + f.write('\t' + val + '\n') f.write(']\n') f.close() + def go(dir): # runs = 5 jsonResult = [] @@ -127,14 +133,14 @@ def go(dir): metaDataDict['benchmark-name'] = 'benchmark1' metaDataDict['type'] = 'autoencoder' # for i in range(runs): - autoencode = AutoEncoder(dir+'/breast.train.csv', - dir+'/breast.test.csv', - metaDataDict) + autoencode = AutoEncoder(dir + '/breast.train.csv', + dir + '/breast.test.csv', metaDataDict) jsonResult.append(autoencode.resultJson) print jsonResult saveJsonResult(jsonResult, 'jsonResults.json') return repr(jsonResult) # return "OK" + if __name__ == '__main__': go('.') diff --git a/archives/workflows/p1b1_hyperopt/Readme.md b/archives/workflows/p1b1_hyperopt/Readme.md index dc5b209c..357139e4 100644 --- a/archives/workflows/p1b1_hyperopt/Readme.md +++ b/archives/workflows/p1b1_hyperopt/Readme.md @@ -1,4 +1,4 @@ -# P1B1 hyperopt Workflow # +# P1B1 hyperopt Workflow The P1B1 hyperopt workflow evaluates a modified version of the P1B1 benchmark autoencoder using hyperparameters provided by a hyperopt instance. The P1B1 @@ -8,14 +8,14 @@ loss. Requirements: -* Python 2.7 -* P1B1 Autoencoder - git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch -to the supervisor branch. -* P1B1 Data - `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv` and `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv`. Download these into some suitable directory (e.g. `workflows/p1b1_hyperopt/data`) -* Hyperopt - http://hyperopt.github.io/hyperopt/ -* Keras - https://keras.io. The supervisor branch of P1B1 should work with -both version 1 and 2. -* Swift-t with Python 2.7 enabled - http://swift-lang.org/Swift-T/ +- Python 2.7 +- P1B1 Autoencoder - git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch + to the supervisor branch. 
+- P1B1 Data - `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv` and `http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv`. Download these into some suitable directory (e.g. `workflows/p1b1_hyperopt/data`) +- Hyperopt - http://hyperopt.github.io/hyperopt/ +- Keras - https://keras.io. The supervisor branch of P1B1 should work with + both version 1 and 2. +- Swift-t with Python 2.7 enabled - http://swift-lang.org/Swift-T/ This workflow also uses code included in this repository: the EMEWS EQ/Py extension (`workflows/p1b1_hyperopt/ext/EQ-Py`) and the eqpy hyperopt bridge code @@ -31,26 +31,25 @@ p1b1_hyperopt/ swift/ ``` - * `data` - model input etc. data, such as the hyperopt space description. - * `etc` - additional code used by EMEWS - * `ext/EQ-Py` - swift-t EQ\Py extension - * `swift/workflow.swift` - the swift workflow script - * `swift/workflow.sh` - generic launch script to set the appropriate enviroment variables etc. and then launch the swift workflow script - * `swift/cori_settings.sh` - settings specific to the Cori supercomputer - * `swift/cori_workflow.sh` - launch script customized for the Cori supercomputer - * `swift/cooley_workflow.sh` - launch script customized for the Cooley supercomputer +- `data` - model input etc. data, such as the hyperopt space description. +- `etc` - additional code used by EMEWS +- `ext/EQ-Py` - swift-t EQ\Py extension +- `swift/workflow.swift` - the swift workflow script +- `swift/workflow.sh` - generic launch script to set the appropriate enviroment variables etc. and then launch the swift workflow script +- `swift/cori_settings.sh` - settings specific to the Cori supercomputer +- `swift/cori_workflow.sh` - launch script customized for the Cori supercomputer +- `swift/cooley_workflow.sh` - launch script customized for the Cooley supercomputer +## Running the Workflow - ## Running the Workflow ## +The launch scripts in the `swift` directory can be used to run the workflow. 
+Copy the `workflow.sh` and edit it as appropriate. The swift script takes +4 arguments, each of which is set in the launch script. - The launch scripts in the `swift` directory can be used to run the workflow. - Copy the `workflow.sh` and edit it as appropriate. The swift script takes - 4 arguments, each of which is set in the launch script. - - * EVALUATIONS - the total number of runs to perform - * PARAM_BATCH_SIZE - the number of hyperparameter sets to evaluate in parallel. Hyperopt will produce this many sets of hyperparameters each iteration until EVALUATIONS has been reached. - * SPACE_FILE - the path of the file that defines hyperopt's hyperparameter space (e.g. EMEWS_PROJECT_ROOT/data/space_description.txt) - * DATA_DIRECTORY - the directory containing the test and training data. The files themselves are assumed to be named `P1B1.train.csv` and `P1B1.test.csv` +- EVALUATIONS - the total number of runs to perform +- PARAM_BATCH_SIZE - the number of hyperparameter sets to evaluate in parallel. Hyperopt will produce this many sets of hyperparameters each iteration until EVALUATIONS has been reached. +- SPACE_FILE - the path of the file that defines hyperopt's hyperparameter space (e.g. EMEWS_PROJECT_ROOT/data/space_description.txt) +- DATA_DIRECTORY - the directory containing the test and training data. The files themselves are assumed to be named `P1B1.train.csv` and `P1B1.test.csv` The launch script also sets PYTHONPATH to include the swift-t EQ-Py extension, the eqpy hyperopt bridge, and the location of the P1B1 python code. Only the @@ -63,35 +62,34 @@ directory where X is the experiment id. A copy of the launch script that was used to launch the workflow will also be written to this directory. -### Running on Cori ### +### Running on Cori 0. You can debug on the login node with `nice swift/workflow.sh ID` 1. The Cori workflow uses Cori's existing deeplearing environment. This includes -Keras, but NOT hyperopt. 
To install hyperopt, if you haven't already: + Keras, but NOT hyperopt. To install hyperopt, if you haven't already: + +``` +module load deeplearning +pip install --user hyperopt +``` - ``` - module load deeplearning - pip install --user hyperopt - ``` 2. Source the `swift/cori_settings.sh` file to load the required modules etc: - ```source cori_settings``` + `source cori_settings` 3. In the swift directory, run the `cori_workflow.sh` launch script with an -experiment id. For example, - - ```./cori_workflow.sh T1``` - + experiment id. For example, +`./cori_workflow.sh T1` -### Running on Cooley ### +### Running on Cooley Cooley uses this python: `/soft/analytics/conda/env/Candle_ML/lib/python2.7/` with hyperopt, keras etc. already installed. 0. You can debug on the login node with `nice swift/workflow.sh ID` 1. Add this Swift/T to your PATH: `~wozniak/Public/sfw/x86_64/login/swift-t-conda/stc/bin` 2. In the swift directory, run the `cooley_workflow.sh` launch scrip with an -experiment id. For example, + experiment id. 
For example, - ```./cooley_workflow.sh T1``` +`./cooley_workflow.sh T1` diff --git a/archives/workflows/p1b1_hyperopt/data/.gitignore b/archives/workflows/p1b1_hyperopt/data/.gitignore index 6d363580..b244c341 100644 --- a/archives/workflows/p1b1_hyperopt/data/.gitignore +++ b/archives/workflows/p1b1_hyperopt/data/.gitignore @@ -1,2 +1,2 @@ P1B1.test.csv -P1B1.train.csv \ No newline at end of file +P1B1.train.csv diff --git a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift index b9dcf3e1..ac2c13f5 100644 --- a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift +++ b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/EQPy.swift @@ -11,10 +11,10 @@ pragma worktypedef resident_work; string init_package_string = """ -import eqpy -import %s -import threading -p = threading.Thread(target=%s.run) +import eqpy +import %s +import threading +p = threading.Thread(target=%s.run) p.start() """; diff --git a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py index 1c739bb7..582c13d2 100644 --- a/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py +++ b/archives/workflows/p1b1_hyperopt/ext/EQ-Py/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,13 +10,14 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q # print("IN_get() ...") result = input_q.get() # print("IN_get(): " + result) return result - diff --git a/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh b/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh index af95237f..0f8d8c85 100644 --- a/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh +++ b/archives/workflows/p1b1_hyperopt/swift/cori_settings.sh @@ -4,5 +4,3 @@ module swap PrgEnv-intel PrgEnv-gnu export PATH=/global/homes/w/wozniak/Public/sfw/compute/swift-t/stc/bin:$PATH #export 
PATH=/global/homes/w/wozniak/Public/sfw/login/swift-t/stc/bin:$PATH - - diff --git a/archives/workflows/simple_hyperopt_example/Readme.md b/archives/workflows/simple_hyperopt_example/Readme.md index bdc7d60f..76845027 100644 --- a/archives/workflows/simple_hyperopt_example/Readme.md +++ b/archives/workflows/simple_hyperopt_example/Readme.md @@ -1,14 +1,14 @@ -# Simple Example of EMEWS Integration with hyperopt # +# Simple Example of EMEWS Integration with hyperopt This directory contains a simple example of integrating hyperopt with EMEWS. Requirements: -* Python 2.7 or 3 -* hyperopt : (http://hyperopt.github.io/hyperopt/). Install with -`pip install hyperopt` -* Swift/T with python extension +- Python 2.7 or 3 +- hyperopt : (http://hyperopt.github.io/hyperopt/). Install with + `pip install hyperopt` +- Swift/T with python extension Run the example with `swift/simple_workflow.sh`. That should properly set the PYTHONPATH, but it does assume that swift-t is in your PATH already. @@ -42,7 +42,8 @@ returned back to hyperopt via the eqpy_hyperopt package. The swift workflow in `swift/swift_run_eqpy.swift` performs the following steps: 1. Initialize the eqpy_hyperopt python with the hyperopt algorithm parameters. -These are formated as a string representation of a python dictionary. + These are formated as a string representation of a python dictionary. + ``` {'space' : %s, 'algo' : %s, @@ -50,22 +51,25 @@ These are formated as a string representation of a python dictionary. 'param_batch_size' : %d, 'seed' : %d} ``` + These are explained in the Readme for eqpy_hyperopt in this repository. 2. Request a list of parameter sets from hyperopt. The list is a ";" separated -string of python dictionaries. For example, + string of python dictionaries. For example, + ``` {'x': [-1.5477895914281512]};{'x': [1.23432434]};{'x': [0.32343]} ``` + If there were more parameters in addition to 'x', those would appear in the dictionary as well. 3. 
Split the list of parameters into an array and execute the model on -each element in that array in parallel. As explained above, executing the model consists -of pasting in the parameters in the python 'model' code and executing that -with a swift python call. + each element in that array in parallel. As explained above, executing the model consists + of pasting in the parameters in the python 'model' code and executing that + with a swift python call. 4. Repeat 2 and 3 until the maximum number of evaluations has been reached -(`max_evals`). + (`max_evals`). 5. Print and write out the best parameter set found by hyperopt. diff --git a/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py b/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py index a544f020..99a8debc 100644 --- a/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py +++ b/archives/workflows/simple_hyperopt_example/ext/EQ-Py/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,11 +10,12 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q result = input_q.get() return result - diff --git a/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh b/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh index af95237f..0f8d8c85 100644 --- a/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh +++ b/archives/workflows/simple_hyperopt_example/swift/cori_settings.sh @@ -4,5 +4,3 @@ module swap PrgEnv-intel PrgEnv-gnu export PATH=/global/homes/w/wozniak/Public/sfw/compute/swift-t/stc/bin:$PATH #export PATH=/global/homes/w/wozniak/Public/sfw/login/swift-t/stc/bin:$PATH - - diff --git a/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh b/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh index ea76b7a7..58ede262 100755 --- a/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh 
+++ b/archives/workflows/simple_hyperopt_example/swift/simple_workflow.sh @@ -108,4 +108,3 @@ swift-t -O0 -l -n $PROCS $MACHINE -p -I $EQPY -r $EQPY \ -e PATH=$PATH \ -e PYTHONPATH=$PYTHONPATH \ $EMEWS_PROJECT_ROOT/swift/$SWIFT_FILE $CMD_LINE_ARGS - diff --git a/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R b/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R index 5a405c6c..c88c74f6 100644 --- a/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R +++ b/archives/workflows/simple_mlrMBO_example/R/mlrMBO_utils.R @@ -30,4 +30,4 @@ append_extras_if_exist <- function(res_element,x){ result_with_extras_if_exist <- function(res,time_value){ lapply(res, function(x) append_extras_if_exist(c(list(y=x[1]), list(time=time_value)),x)) -} \ No newline at end of file +} diff --git a/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R b/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R index 7db8cb2b..a2ac7c9f 100644 --- a/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R +++ b/archives/workflows/simple_mlrMBO_example/R/test/mlrMBO_utils_tests.R @@ -3,7 +3,7 @@ require(testthat) test_that("list_to_string works",{ l = list(x1 = -4.5, x2 = 6.3) - expected_string = "-4.5, 6.3" + expected_string = "-4.5, 6.3" result_string = list_to_string(l) # print(result_string) expect_equal(expected_string,result_string) @@ -13,7 +13,7 @@ test_that("elements_of_lists_to_string works",{ l1 = list(x1 = -4.5, x2 = 6.3) l2 = list(x1 = 7.6, x2 = 0.3) l3 = list(l1,l2) - expected_string = "-4.5, 6.3;7.6, 0.3" + expected_string = "-4.5, 6.3;7.6, 0.3" result_string = elements_of_lists_to_string(l3) # print(result_string) expect_equal(expected_string,result_string) @@ -21,16 +21,16 @@ test_that("elements_of_lists_to_string works",{ test_that("append_extras_if_exist works",{ x = c(1,2,3) - res_element = list(y = 1, time = 2.3) + res_element = list(y = 1, time = 2.3) new_res_element = append_extras_if_exist(res_element,x) - 
expected_res_element = list(y = 1, time = 2.3, user.extras = list(2,3)) + expected_res_element = list(y = 1, time = 2.3, user.extras = list(2,3)) # print(new_res_element) expect_equal(expected_res_element,new_res_element, info = "length(x) > 1") - + x = c(3) - res_element = list(y = 3, time = 2.3) + res_element = list(y = 3, time = 2.3) new_res_element = append_extras_if_exist(res_element,x) - expected_res_element = list(y = 3, time = 2.3) + expected_res_element = list(y = 3, time = 2.3) # print(new_res_element) expect_equal(expected_res_element,new_res_element, info = "length(x) == 1") }) @@ -40,24 +40,23 @@ test_that("result_with_extras_if_exist works",{ new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5, user.extras = list(2,3)), list(y = 4, time = 4.5, user.extras = list(5,6)), - list(y = 7, time = 4.5, user.extras = list(8,9))) + list(y = 7, time = 4.5, user.extras = list(8,9))) # print(new_res_element) expect_equal(expected_res,new_res, info = "length(x) > 1, uniform") - + list_of_vectors = list(c(1,2,3),c(4,6),c(7)) new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5, user.extras = list(2,3)), list(y = 4, time = 4.5, user.extras = list(6)), - list(y = 7, time = 4.5)) + list(y = 7, time = 4.5)) # print(new_res_element) expect_equal(expected_res,new_res, info = "length(x) mixed") - + list_of_vectors = list(c(1),c(4),c(7)) new_res = result_with_extras_if_exist(list_of_vectors,4.5) expected_res = list(list(y = 1, time = 4.5), list(y = 4, time = 4.5), - list(y = 7, time = 4.5)) + list(y = 7, time = 4.5)) # print(new_res_element) - expect_equal(expected_res,new_res, info = "length(x) == 1") + expect_equal(expected_res,new_res, info = "length(x) == 1") }) - diff --git a/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R b/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R index 109264d2..962b9b46 100644 --- 
a/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R +++ b/archives/workflows/simple_mlrMBO_example/R/test/simple_mlrMBO_run_test.R @@ -29,4 +29,4 @@ IN_get <- function(){ ## Assumes working directory is ../ source("simple_mlrMBO.R") -## Look at result with: readRDS("final_res.Rds") \ No newline at end of file +## Look at result with: readRDS("final_res.Rds") diff --git a/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R b/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R index adbeedbc..8fb4a7ef 100644 --- a/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R +++ b/archives/workflows/simple_mlrMBO_example/R/test/test_utils.R @@ -1,4 +1,4 @@ -# Split the string pushed into OUT_put into +# Split the string pushed into OUT_put into # list of numerical vectors (used in simple_mlrMBO_run_test.R) split.into.param.lines <- function(x){ res1 <- unlist(strsplit(x,split = ";")) @@ -7,4 +7,4 @@ split.into.param.lines <- function(x){ make.into.q.res <- function(x){ paste0(x,collapse = ";") -} \ No newline at end of file +} diff --git a/archives/workflows/simple_mlrMBO_example/README.md b/archives/workflows/simple_mlrMBO_example/README.md index 04c20954..0a3c0afa 100644 --- a/archives/workflows/simple_mlrMBO_example/README.md +++ b/archives/workflows/simple_mlrMBO_example/README.md @@ -1,33 +1,36 @@ -# Simple Example of EMEWS Integration with mlrMBO # +# Simple Example of EMEWS Integration with mlrMBO This directory contains a simple example of integrating mlrMBO with EMEWS. Requirements: -* R 3.2+ -* All required R packages can be installed with -`install.packages("")` - * mlrMBO and dependencies : (https://mlr-org.github.io/mlrMBO/). 
- * parallelMap : (https://cran.r-project.org/web/packages/parallelMap/index.html) - * DiceKriging and dependencies : (https://cran.r-project.org/web/packages/DiceKriging/index.html) - * rgenoud : (https://cran.r-project.org/web/packages/rgenoud/index.html) - * testthat (for testing) : (https://cran.r-project.org/web/packages/testthat/index.html) -* Swift/T with R extension -* Compiled EQ/R, instructions in `ext/EQ-R/eqr/COMPILING.txt` +- R 3.2+ +- All required R packages can be installed with + `install.packages("")` + - mlrMBO and dependencies : (https://mlr-org.github.io/mlrMBO/). + - parallelMap : (https://cran.r-project.org/web/packages/parallelMap/index.html) + - DiceKriging and dependencies : (https://cran.r-project.org/web/packages/DiceKriging/index.html) + - rgenoud : (https://cran.r-project.org/web/packages/rgenoud/index.html) + - testthat (for testing) : (https://cran.r-project.org/web/packages/testthat/index.html) +- Swift/T with R extension +- Compiled EQ/R, instructions in `ext/EQ-R/eqr/COMPILING.txt` Run the example with `./swift_run_eqr.sh `. That assume that swift-t is in your PATH already. ## Workflow details + The workflow attempts to minimize the example function `sum(x^2)` for a two dimensional space `(x1,x2)` defined by the variables: + ```R "x1": lower = -5, upper = 5 "x2": lower = -10, upper = 20 ``` and using existing capabilities from mlrMBO: -* **expected improvement** for the infill criterion -* **constant liar** for multi-point proposals + +- **expected improvement** for the infill criterion +- **constant liar** for multi-point proposals The example uses **multi-point proposals** for concurrency in the iterative steps, defined via a `pp=` argument within the `swift/swift_run_eqr.sh` script. Maximum algorithm iteration is defined via a `it=` argument, also within the `swift/swift_run_eqr.sh` script. 
@@ -36,15 +39,16 @@ The mlrMBO algorithm is defined in `R/simple_mlrMBO.R` and it controls the overa As indicated above, the workflow is run with `./swift_run_eqr.sh `. When the workflow completes, the results from running `mbo` are saved to the experiment directory in `experiments/experiment_ID/final_res.Rds` and can be loaded within an R session using `readRDS("/final_res.Rds")`. ## Testing the R components + The `R/test` directory contains tests for the R components in the workflow and for running the mlrMBO algorithm without Swift/T. -* `mlrMBO_utils_tests.R`: unit tests for `R/mlrMBO_utils.R`, which provides R components to the workflow (run using the testthat library's `test_file("/mlrMBO_utils_tests.R")` function) -* `simple_mlrMBO_run_test.R`: script that provides R implementations for the EQ/R `OUT_put` and `IN_get` calls to be able to run `R/simple_mlrMBO.R` at smaller scales for testing without Swift/T (run from the `R` directory via `source("test/simple_mlrMBO_run_test.R")`) -* `test_utils_tests.R`: tests for functions in `R/test/test_utils.R` which are used to make `simple_mlrMBO_run_test.R` work (run using `test_file("/test_utils_tests.R")`) -*(Below is the information that was generated when the simple_mlrMBO_example EMEWS project was created.)* +- `mlrMBO_utils_tests.R`: unit tests for `R/mlrMBO_utils.R`, which provides R components to the workflow (run using the testthat library's `test_file("/mlrMBO_utils_tests.R")` function) +- `simple_mlrMBO_run_test.R`: script that provides R implementations for the EQ/R `OUT_put` and `IN_get` calls to be able to run `R/simple_mlrMBO.R` at smaller scales for testing without Swift/T (run from the `R` directory via `source("test/simple_mlrMBO_run_test.R")`) +- `test_utils_tests.R`: tests for functions in `R/test/test_utils.R` which are used to make `simple_mlrMBO_run_test.R` work (run using `test_file("/test_utils_tests.R")`) + +_(Below is the information that was generated when the simple_mlrMBO_example EMEWS 
project was created.)_ -EMEWS project template ------------------------ +## EMEWS project template You have just created an EMEWS project. The project consists of the following directories: @@ -62,18 +66,19 @@ simple_mlrMBO_example/ swift/ README.md ``` + The directories are intended to contain the following: - * `data` - model input etc. data - * `etc` - additional code used by EMEWS - * `ext` - swift-t extensions such as eqpy, eqr - * `python` - python code (e.g. model exploration algorithms written in python) - * `python/test` - tests of the python code - * `R` - R code (e.g. model exploration algorithms written R) - * `R/test` - tests of the R code - * `scripts` - any necessary scripts (e.g. scripts to launch a model), excluding - scripts used to run the workflow. - * `swift` - swift code +- `data` - model input etc. data +- `etc` - additional code used by EMEWS +- `ext` - swift-t extensions such as eqpy, eqr +- `python` - python code (e.g. model exploration algorithms written in python) +- `python/test` - tests of the python code +- `R` - R code (e.g. model exploration algorithms written R) +- `R/test` - tests of the R code +- `scripts` - any necessary scripts (e.g. scripts to launch a model), excluding + scripts used to run the workflow. +- `swift` - swift code Use the subtemplates to customize this structure for particular types of workflows. These are: sweep, eqpy, and eqr. 
diff --git a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h index c9dfd41c..a9f983da 100644 --- a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h +++ b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/BlockingQueue.h @@ -24,7 +24,7 @@ class BlockingQueue { } this->d_condition.notify_one(); } - + T pop() { std::unique_lock lock(this->d_mutex); // [ capture-list ] ( params ) { body } diff --git a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk index c8dacae6..7906db3c 100644 --- a/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk +++ b/archives/workflows/simple_mlrMBO_example/ext/EQ-R/eqr/settings.mk @@ -1,6 +1,6 @@ -CXXFLAGS = -g -O0 -fPIC -std=c++0x -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include -CPPFLAGS = -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include +CXXFLAGS = -g -O0 -fPIC -std=c++0x -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include +CPPFLAGS = -I/usr/local/include -I/Library/Frameworks/R.framework/Versions/3.3/Resources/include -I/Users/jozik/Library/R/3.3/library/Rcpp/include -I/Users/jozik/Library/R/3.3/library/RInside/include LDFLAGS = -L/Users/jozik/Library/R/3.3/library/RInside/lib -lRInside -L/Library/Frameworks/R.framework/Versions/3.3/Resources/lib -lR -L/usr/local/lib -ltcl8.6 -Wl,-rpath -Wl,/usr/local/lib -Wl,-rpath -Wl,/Library/Frameworks/R.framework/Versions/3.3/Resources/lib -Wl,-rpath 
-Wl,/Users/jozik/Library/R/3.3/library/RInside/lib TCL_VERSION = 8.6 diff --git a/archives/workflows/simple_uq/python/permute.py b/archives/workflows/simple_uq/python/permute.py index 48fa530f..f47507e8 100644 --- a/archives/workflows/simple_uq/python/permute.py +++ b/archives/workflows/simple_uq/python/permute.py @@ -1,22 +1,25 @@ - from random import randint + class State: seed = None size = None training = None + state = State() + def configure(seed, size, training): global state state.seed = seed state.size = size state.training = training print("permute: configure(seed=%i, size=%i, training=%i)" % - (seed, size, training)) + (seed, size, training)) return "OK" + def get(): global state result = [] @@ -27,25 +30,27 @@ def get(): n = state.training for i in range(0, state.training): # print(pool) - i = randint(0,n+1) + i = randint(0, n + 1) v = pool[i] result.append(v) del pool[i] - n = n-1 + n = n - 1 return result + def validation(size, training): - """ Obtain the validation set corresponding to the given training set """ + """Obtain the validation set corresponding to the given training set.""" result = [] for i in range(0, size): if i not in training: result.append(i) return result + def get_tv(): - """ Get training and validation """ + """Get training and validation.""" global state t = get() v = validation(state.size, t) # return str([t, v]) - return t,v + return t, v diff --git a/archives/workflows/simple_uq/python/test-permute.py b/archives/workflows/simple_uq/python/test-permute.py index c36d7d5d..8413de3a 100644 --- a/archives/workflows/simple_uq/python/test-permute.py +++ b/archives/workflows/simple_uq/python/test-permute.py @@ -1,11 +1,10 @@ - import permute size = 10 validation = 2 -permute.configure(seed=10101, size=size, training=size-validation) +permute.configure(seed=10101, size=size, training=size - validation) -for i in range(0,9): - training = permute.get() - validation = permute.validation(size, training) - print str(training) + " " + 
str(validation) +for i in range(0, 9): + training = permute.get() + validation = permute.validation(size, training) + print str(training) + " " + str(validation) diff --git a/archives/workflows/simple_uq/swift/junk.py b/archives/workflows/simple_uq/swift/junk.py index f7fc57d0..5287911d 100644 --- a/archives/workflows/simple_uq/swift/junk.py +++ b/archives/workflows/simple_uq/swift/junk.py @@ -1,3 +1,2 @@ - inputs = eval(permutation_sets) training, validation = inputs diff --git a/archives/workflows/simple_uq/swift/obj_func.py b/archives/workflows/simple_uq/swift/obj_func.py index bbbc3bcd..49a355f1 100644 --- a/archives/workflows/simple_uq/swift/obj_func.py +++ b/archives/workflows/simple_uq/swift/obj_func.py @@ -1,4 +1,3 @@ - # OBJ FUNC PY import os @@ -18,22 +17,24 @@ size = 10 validation = 2 -permute.configure(seed=int(index)+10101, size=size, training=size-validation) +permute.configure(seed=int(index) + 10101, + size=size, + training=size - validation) training, validation = permute.get_tv() log = directory + "/" + "run.log" with open(log, "w") as fp: - fp.write("training: " + str(training) + "\n") + fp.write("training: " + str(training) + "\n") fp.write("validation: " + str(validation) + "\n\n") # Funny function result = float(0.0) -multiplier = float(10*10*10) -for i in range(0,5): - result = result + training[i]*multiplier +multiplier = float(10 * 10 * 10) +for i in range(0, 5): + result = result + training[i] * multiplier multiplier /= 10 with open(output, "w") as fp: # fp.write("training: " + str(training) + "\n") - fp.write(str(result)+"\n") + fp.write(str(result) + "\n") diff --git a/docs/format.css b/docs/format.css index c1223e5d..596beb36 100644 --- a/docs/format.css +++ b/docs/format.css @@ -1,4 +1,3 @@ - /* Asciidoc customizations */ a:visited { diff --git a/docs/home.html b/docs/home.html index 868defef..0b551b14 100644 --- a/docs/home.html +++ b/docs/home.html @@ -1,850 +1,1115 @@ - + - - - -CANDLE Supervisor Home Page - - - - - -
-
-
-

This is the main home page about CANDLE Supervisor effort with links to workflows and other supporting information.

-
-
-
-

Workflows

-
-

The workflows are currently indexed in the README visible here.

-
-
-
-

Database integration

-
-

The database work is described in the README visible here.

-
-
-
-

Swift installations

-
-
-

Theta

-

This is linked to Python and R but currently without ML libs.

-

Other Theta ESP notes are here: https://collab.cels.anl.gov/display/ESP

-
-

Python

-

Installed in:

-
-
-
/projects/Candle_ECP/swift/deps/Python-2.7.12
-
-

To run this installation, you must set:

-
-
-
$ export LD_LIBRARY_PATH=/projects/Candle_ECP/swift/deps/Python-2.7.12/lib
-
-
    -
  • -

    -Cori -

    -

    This uses the system-installed Python with ML libs in module:
    -tensorflow/intel-head

    -
  • -
  • -

    -Titan -

    -

    This is a CANDLE-only installation. It uses the OLCF-provided Python deeplearning module (Python 3.6 plus TensorFlow, Theano, and Keras) and R 3.3.2 .

    -

    Add to PATH: /lustre/atlas2/csc249/proj-shared/sfw/swift-t/stc/bin

    -

    Run with:

    -
    -
    -
    $ export TITAN=true
    +
    +      a:visited {
    +        color: gray;
    +      }
    +      h5 {
    +        font-size: 0.8em;
    +      }
    +    
    +    
    +  
    +  
    +    
    +    
    +
    +
    +
    +

    + This is the main home page about CANDLE Supervisor effort with + links to workflows and other supporting information. +

    +
    +
    +
    +
    +

    Workflows

    +
    +
    +

    + The workflows are currently indexed in the README visible + here. +

    +
    +
    +
    +
    +

    Database integration

    +
    +
    +

    + The database work is described in the README visible + here. +

    +
    +
    +
    +
    +

    Swift installations

    +
    +
    +

    + Theta +

    +
    +

    + This is linked to Python and R but currently without ML libs. +

    +
    +
    +

    + Other Theta ESP notes are here: + https://collab.cels.anl.gov/display/ESP +

    +
    +
    +

    Python

    +

    Installed in:

    +
    +
    +
    /projects/Candle_ECP/swift/deps/Python-2.7.12
    +
    +
    +
    +

    To run this installation, you must set:

    +
    +
    +
    +
    $ export LD_LIBRARY_PATH=/projects/Candle_ECP/swift/deps/Python-2.7.12/lib
    +
    +
    +
    +
      +
    • +

      + Cori +

      +
      +

      + This uses the system-installed Python with ML libs in + module:
      + tensorflow/intel-head +

      +
      +
    • +
    • +

      + Titan +

      +
      +

      + This is a CANDLE-only installation. It uses the + OLCF-provided Python deeplearning module + (Python 3.6 plus TensorFlow, Theano, and Keras) and R + 3.3.2 . +

      +
      +
      +

      + Add to PATH: + /lustre/atlas2/csc249/proj-shared/sfw/swift-t/stc/bin +

      +
      +

      Run with:

      +
      +
      +
      $ export TITAN=true
       $ export PROJECT=... QUEUE=...
       $ export LD_LIBRARY_PATH=/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/lib:/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3/cuda/lib64:/opt/gcc/4.9.3/snos/lib64:/sw/xk6/r/3.3.2/sles11.3_gnu4.9.3x/lib64/R/lib
       $ swift-t -m cray -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH workflow.swift
      -
      -
    • -
    -

    On Titan, do module load autoconf to get Autoconf 2.69 .

    -
      -
    • -

      -Cooley -

      -

      This uses the system-installed Python with ML libs at:
      -/soft/analytics/conda/env/Candle_ML

      -
    • -
    • -

      -JLSE KNL -

      -

      This does not yet have Python.

      -
    • -
    • -

      -JLSE Prasanna
      -This uses a VirtualEnv Python at /home/pbalapra/.virtualenvs -

      -
        -
      • -

        -~wozniak/Public/sfw/icc/swift-t-pb/stc/bin -

        -
      • -
      -
    • -
    -
    -
    -
    -
    -
    -

    - - +
    +
    +
  • +
+
+
+

+ On Titan, do module load autoconf to get Autoconf + 2.69 . +

+
+
+
    +
  • +

    + Cooley +

    +
    +

    + This uses the system-installed Python with ML libs + at:
    + /soft/analytics/conda/env/Candle_ML +

    +
    +
  • +
  • +

    + JLSE KNL +

    +
    +

    This does not yet have Python.

    +
    +
  • +
  • +

    + JLSE Prasanna
    + This uses a VirtualEnv Python at + /home/pbalapra/.virtualenvs +

    +
    +
      +
    • +

      + ~wozniak/Public/sfw/icc/swift-t-pb/stc/bin +

      +
    • +
    +
    +
  • +
+
+
+
+
+
+
+

+ + diff --git a/docs/summit.txt b/docs/summit.txt index dd2ce52b..46285aad 100644 --- a/docs/summit.txt +++ b/docs/summit.txt @@ -17,7 +17,7 @@ $ make install-binaries install-libraries install-headers == Install Python -$ ./configure +$ ./configure --prefix=/gpfs/alpine/world-shared/med106/sw/gcc-7.4.0/Python-3.5.1 --enable-shared $ nice make -j diff --git a/docs/user_guide.adoc b/docs/user_guide.adoc index e197d27c..c54e3ae8 100644 --- a/docs/user_guide.adoc +++ b/docs/user_guide.adoc @@ -203,7 +203,7 @@ $ export WALLTIME=00:10:00 $ ./test/upf-1.sh theta -// or +// or $ QUEUE=debug-cache-quad PROJECT=myproject PROCS=3 WALLTIME=00:10:00 ./test/upf-1.sh theta ---- @@ -252,7 +252,7 @@ result: 2.10822688904 <1> `output.txt` contains stdout and stderr of this experiment. This is helpful to debug errors. <2> `run` directory contains the output files. You will see two directories that are corresponding the IDs configured in upf-1.txt <3> a copy of configuration files are available so that you can trace what were passed to this experiment. -<4> stdout of test0. After 10 epoches, validation loss was 2.1082. +<4> stdout of test0. After 10 epoches, validation loss was 2.1082. == Running mlrMBO based Hyperparameters Optimization (HPO) on Theta @@ -315,4 +315,3 @@ You can specify the HPO search strategy. As you can see in `test/cfg-prm-1.sh`, * `MAX_ITERATIONS` is a number of iterations. * `PROPOSE_POINTS` is a number of parameter sets that CANDLE will evaluate in each iteration. So, if `MAX_ITERATION=3` and `PROPOSE_POINTS=5`, CANDLE will be ended up evaluating 25 params (10 + 3 x 5). * `MAX_BUDGET` should be greater than total evaluations. In this example, 45. - diff --git a/docs/user_guide.html b/docs/user_guide.html index e7da7374..ee657c0f 100644 --- a/docs/user_guide.html +++ b/docs/user_guide.html @@ -1,1012 +1,1394 @@ - - - - - - -CANDLE Library User Guide - - - - - -
-
-
-

The CANDLE library provides a wrapper class and utility functions, which enable users run their own deep learning code in high performance computers that CANDLE supports. With the current version of CANDLE library, users should be able to run hyperparameter optimization (mlrMBO workflow) or parallel excution (upf workflow). Due to the design of both workflows, users are required to implement certain methods (will be explained in section 1) and modify several config files (section 2). This user guide will provide an overview of structure and explanation of parameters or varaiables as needed.

-
-
-

How to write CANDLE compliant deep learning code

-
-

Minimum requirements

-
-

The CANDLE requires two methods, initialize_parameters() and run().

-
-

Initialize_parameters Method

-

In initialize_parameters method, we will construct a class and build a parameter set, which will be used inside your deep learning code (run method). We provides some common parameters such as batch_size, epochs, etc. In addition to that, you can construct your own parameters (see Aurgument Specification section below). Finally, the initialize_parameters should return a python dictionary, in this doc, will be called gParameters (global parameters).

-
-
-

Run Method

-

You can place your deep learning code in run(Dict) method. You can use parameter varaiable like gParameters['batch_size'].

-

We have an example, that converted a simple MNIST neural net mnist_mlp.py provided by Keras Team into CANDLE compliant form. In this example, you will see how the initialize_parameters method is implemented and how the actual NN code was transplanted in run method.

-

Finally, the run() returns history. This can be omitted for upf workflow, but required for HPO workflow.

-
-
-
    -
  1. -

    -In next section, we will explain where the common.MNIST class came from. -

    -
  2. -
  3. -

    -initialize_parameters return dictionary -

    -
  4. -
  5. -

    -run method receives parameter dictionary -

    -
  6. -
  7. -

    -returns history object -

    -
  8. -
-
-
-
-
-

Argument Sepcification

-
-

In order to take advantage of the CANDLE framework, a model needs to be able to modify its parameters via either reading from the default_model file, or overwriting those parameters via an appropriate command line argument. We standadized frequently used ML keywords, as well as certain other keywords which are used by the CANDLE scripts. We recommend users aware of these arguemtns to avoid conflicts. For these CANDLE built-in command line arguments, please see default_utils.py

-
-

Adding keyword

-

In order to simplify the process of adding keywords, we require the user to provide a list of metadata of how to parse the arugment.

-
-
-
[{
-  'name':'shared_nnet_spec', // <1>
-  'nargs':'+', // <2>
-  'type': int, // <3>
-  'help':'network structure of shared layer' // <4>
-}, ...]
-
-
    -
  1. -

    -required. Name of parameter. -

    -
  2. -
  3. -

    -optional. The number of command-line arguments. -

    -
  4. -
  5. -

    -required. The type to which the command-line arguments should be converted. -

    -
  6. -
  7. -

    -optional. A brief description of what the argument does. -you can add default, choices, and action as needed. -

    -
  8. -
-
-
-

Building Class

-

When you have a list of additional paramaters, you need to pass the definition to be parsed. Even though you don’t have any additional parameters, this is generally recommended, since you can buid your own shared method and build data processing code that will be shared.

-

Please take a look this example. This is a source of common.MNIST class definition.

-
-
-
additional_definitions = None
-required = None
-
-class MNIST(default_utils.Benchmark):
-    def set_locals(self):
-        if required is not None:
-            self.required = set(required)
-        if additional_definitions is not None:
-            self.additional_definitions = additional_definitions
-
-
-
-

Thead Optimization

-

Some HPC machines like Theta, the performance will greatly improved if we let CANDLE handles threads. So, it is generally recommended to have code like line 14 to 21 in this example

-
-
-
-

How to run CANDLE compliant code in Theta

-

As mentioned above, we offer two different workflows in CANDLE: -Unrolled Parameter File (UPF) and Hyper Parameter Optimization (HPO). -The UPF workflow allows you to run parallel multi-node executions with different parameters, -while HPO workflow evaluates the best value of hyperparameters based on mlrMBO algorithm.

-
-

Running UPF on Theta

-
-

Step 1. Checkout Supervisor repo

-
-
-
$ git clone https://github.com/ECP-CANDLE/Supervisor.git
-
-

Step 2. Move to upf workflow directory

-
-
-
$ cd Supervisor/workflow/upf
-
-

Step 3. Set Env variables. In test/cfg-sys-1.sh, -you will need to set BENCHMARK_DIR to point the directory that your script locates, and -MODEL_PYTHON_SCRIPT to name the script you want to run

-
-
-
BENCHMARK_DIR=directory_where_my_script_locates
-MODEL_PYTHON_SCRIPT=my_script
-
-

Step 4. Set execution plan. Check test/upf-1.txt for parameter configuration and modify as needed. -This file contains multiple number of JSON documents. Each JSON document will contain the command line parameters. -For example,

-
-
-
{"id": "test0", "epochs": 10}
-{"id": "test1", "epochs": 20}
-
-

This will invoke two instances, which will run 10 epochs and 20 epochs respectively.

-

Step 5. Submit your job. You will need to set QUEUE, PROJECT, PROCS, and WALLTIME. -You can configure those in cfg-sys-1.sh (see Step 3), set as env variables, or you can provide in your command line (see below).

-
-
-
$ export QUEUE=default
-$ export PROJECT=myproject
-$ export PROCS=3
-$ export WALLTIME=01:00:00
-
-$ ./test/upf-1.sh theta upf-1.txt
-
-// or
-
-$ QUEUE=default PROJECT=myproject PROCS=3 WALLTIME=01:00:00 ./test/upf-1.sh theta upf-1.txt
-
-
-

Step 6. Check queue status

-
-
-
$ qstat -h user_name -f
-
-
-
-
-

Running mlrMBO based Hyperparameters Optimization (HPO) on Theta

-
-

Step 1. Checkout Supervisor repo

-
-
-
$ git clone https://github.com/ECP-CANDLE/Supervisor.git
-
-

Step 2. Move to mlrMBO workflow directory

-
-
-
$ cd Supervisor/workflow/mlrMBO
-
-

Step 3. Set Env variables. In test/cfg-sys-1.sh, -you will need to set BENCHMARK_DIR to point the directory that your script locates, and -MODEL_PYTHON_SCRIPT to name the script you want to run

-
-
-
BENCHMARK_DIR=directory_where_my_script_locates
-MODEL_PYTHON_SCRIPT=my_script
-
-

Step 4. Config hyper parameters. In this step, we are configuring parameter sets, which we will iteratively evaluate. -For example, you can create workflow/data/mnist.R as below.

-
-
-
param.set <- makeParamSet(
-  makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512)),
-  makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")),
-  makeDiscreteParam("optimizer", values=c("adam", "sgd", "rmsprop")),
-  makeIntegerParam("epochs", lower=20, upper=20)
-)
-
-

In this example, we are varying four paramters, batch_size, activation, optimizer, epochs. -Entire parameter space will be 5 x 3 x 3 x 1.

-

After creating this file, we need to point this file.

-
-
-
$ export PARAM_SET_FILE=mnist.R
-
-

Step 5. Submit your job.

-
-
-
$ ./test/test-1.sh mnist theta
-
-

The first argument is MODEL_NAME. If the name is registered in test/cfg-prm-1.sh, it will use the pre-configured parameter file. -Otherwise, CANDLE will use PARAM_SET_FILE we configured in step 4.

-

You can specify the HPO search strategy. As you can see in test/cfg-prm-1.sh, you are able to config PROPOSE_POINTS, MAX_CONCURRENT_EVALUATIONS, MAX_ITERATIONS, MAX_BUDGE, DESIGN_SIZE.

-
    -
  • -

    -DESIGN_SIZE is a number of param sets that will evaluate at the beginning of HPO search. In this example, CANDLE will select random 10 param sets out of 45 (see Step 4, for break downs). -

    -
  • -
  • -

    -MAX_ITERATIONS is a number of iteration. -

    -
  • -
  • -

    -PROPOSE_POINTS is a number of param sets that CANDLE will evaluate in each iteration. So, if MAX_ITERATION=3 and PROPOSE_POINTS=5, CANDLE will ended up evaluating 25 params (3 x 5 + 10). -

    -
  • -
  • -

    -MAX_BUDGET should be greater than total evaluations. In this example, 45. -

    -
  • -
-
-
-
-

- - - + + + + + + + CANDLE Library User Guide + + + + + +
+
+
+
+

+ The CANDLE library provides a wrapper class and utility functions, + which enable users to run their own deep learning code on the high + performance computers that CANDLE supports. With the current + version of the CANDLE library, users should be able to run + hyperparameter optimization (mlrMBO workflow) or parallel execution + (upf workflow). Due to the design of both workflows, users are + required to implement certain methods (explained in + section 1) and modify several config files (section 2). This user + guide provides an overview of the structure and explanations of + parameters or variables as needed. +

+
+
+
+

+ How to write CANDLE compliant deep learning code +

+
+

Minimum requirements

+
+
+

+ CANDLE requires two methods, + initialize_parameters() and run(). +

+
+
+

+ Initialize_parameters Method +

+
+

+ In the initialize_parameters method, we will construct + a class and build a parameter set, which will be used inside + your deep learning code (run method). We provide some common + parameters such as batch_size, epochs, + etc. In addition to that, you can construct your own parameters + (see the Argument Specification section below). Finally, + initialize_parameters should return a python + dictionary, which in this doc will be called + gParameters (global parameters). +

+
+
+
+

Run Method

+
+

+ You can place your deep learning code in the + run(Dict) method. You can use parameter variables + like gParameters['batch_size']. +

+
+
+

+ We have an + example, that converted a simple MNIST neural net + mnist_mlp.py provided by + Keras Team + into CANDLE compliant form. In this example, you will see how + the initialize_parameters method is implemented and + how the actual NN code was transplanted in + run method. +

+
+
+

+ Finally, run() returns the history object. This can be + omitted for the upf workflow, but is required for the HPO workflow. +

+
+
+
+
+
+
    +
  1. +

    + In next section, we will explain where the common.MNIST + class came from. +

    +
  2. +
  3. +

    initialize_parameters return dictionary

    +
  4. +
  5. +

    run method receives parameter dictionary

    +
  6. +
  7. +

    returns history object

    +
  8. +
+
+
+
+
+
+

Argument Specification

+
+
+

+ In order to take advantage of the CANDLE framework, a model needs + to be able to modify its parameters via either reading from the + default_model file, or overwriting those parameters via an + appropriate command line argument. We standardized frequently used + ML keywords, as well as certain other keywords which are used by + the CANDLE scripts. We recommend that users be aware of these arguments to + avoid conflicts. For these CANDLE built-in command line arguments, + please see + default_utils.py +

+
+
+

Adding keyword

+
+

+ In order to simplify the process of adding keywords, we require + the user to provide a list of metadata of how to parse the + argument. +

+
+
+
+
[{
+  'name':'shared_nnet_spec', // <1>
+  'nargs':'+', // <2>
+  'type': int, // <3>
+  'help':'network structure of shared layer' // <4>
+}, ...]
+
+
+
+
    +
  1. +

    required. Name of parameter.

    +
  2. +
  3. +

    optional. The number of command-line arguments.

    +
  4. +
  5. +

    + required. The type to which the command-line arguments + should be converted. +

    +
  6. +
  7. +

    + optional. A brief description of what the argument does. you + can add default, choices, and + action as needed. +

    +
  8. +
+
+
+
+

Building Class

+
+

+ When you have a list of additional parameters, you need to pass + the definition to be parsed. Even if you don’t have + any additional parameters, this is generally recommended, since + you can build your own shared method and build data processing + code that will be shared. +

+
+
+

+ Please take a look at this + example. This is the source of the common.MNIST class definition. +

+
+
+
+
additional_definitions = None
+required = None
+
+class MNIST(default_utils.Benchmark):
+    def set_locals(self):
+        if required is not None:
+            self.required = set(required)
+        if additional_definitions is not None:
+            self.additional_definitions = additional_definitions
+
+
+
+
+

Thread Optimization

+
+

+ On some HPC machines like Theta, the performance will + be greatly improved if we let CANDLE handle threads. So, it is + generally recommended to have code like lines 14 to 21 in + this example +

+
+
+
+
+

+ How to run CANDLE compliant code in Theta +

+
+

+ As mentioned above, we offer two different workflows in CANDLE: + Unrolled Parameter File (UPF) and Hyper Parameter Optimization (HPO). + The UPF workflow allows you to run parallel multi-node executions with + different parameters, while HPO workflow evaluates the best value of + hyperparameters based on mlrMBO algorithm. +

+
+
+

Running UPF on Theta

+
+

Step 1. Checkout Supervisor repo

+
+
+
$ git clone https://github.com/ECP-CANDLE/Supervisor.git
+
+
+
+

Step 2. Move to upf workflow directory

+
+
+
+
$ cd Supervisor/workflow/upf
+
+
+
+

+ Step 3. Set Env variables. In test/cfg-sys-1.sh, you + will need to set BENCHMARK_DIR to point to the directory + where your script is located, and MODEL_PYTHON_SCRIPT to + name the script you want to run +

+
+
+
+
BENCHMARK_DIR=directory_where_my_script_locates
+MODEL_PYTHON_SCRIPT=my_script
+
+
+
+

+ Step 4. Set execution plan. Check test/upf-1.txt for + parameter configuration and modify as needed. This file contains + multiple JSON documents. Each JSON document will contain + the command line parameters. For example, +

+
+
+
+
{"id": "test0", "epochs": 10}
+{"id": "test1", "epochs": 20}
+
+
+
+

+ This will invoke two instances, which will run 10 epochs and 20 + epochs respectively. +

+
+
+

+ Step 5. Submit your job. You will need to set QUEUE, + PROJECT, PROCS, and + WALLTIME. You can configure those in + cfg-sys-1.sh (see Step 3), set as env variables, or + you can provide in your command line (see below). +

+
+
+
+
$ export QUEUE=default
+$ export PROJECT=myproject
+$ export PROCS=3
+$ export WALLTIME=01:00:00
+
+$ ./test/upf-1.sh theta upf-1.txt
+
+// or
+
+$ QUEUE=default PROJECT=myproject PROCS=3 WALLTIME=01:00:00 ./test/upf-1.sh theta upf-1.txt
+
+
+
+ +
+

Step 6. Check queue status

+
+
+
$ qstat -h user_name -f
+
+
+
+
+
+

+ Running mlrMBO based Hyperparameters Optimization (HPO) on Theta +

+
+

Step 1. Checkout Supervisor repo

+
+
+
$ git clone https://github.com/ECP-CANDLE/Supervisor.git
+
+
+
+

Step 2. Move to mlrMBO workflow directory

+
+
+
+
$ cd Supervisor/workflow/mlrMBO
+
+
+
+

+ Step 3. Set Env variables. In test/cfg-sys-1.sh, you + will need to set BENCHMARK_DIR to point to the directory + where your script is located, and MODEL_PYTHON_SCRIPT to + name the script you want to run +

+
+
+
+
BENCHMARK_DIR=directory_where_my_script_locates
+MODEL_PYTHON_SCRIPT=my_script
+
+
+
+

+ Step 4. Config hyper parameters. In this step, we are configuring + parameter sets, which we will iteratively evaluate. For example, + you can create workflow/data/mnist.R as below. +

+
+
+
+
param.set <- makeParamSet(
+  makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512)),
+  makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")),
+  makeDiscreteParam("optimizer", values=c("adam", "sgd", "rmsprop")),
+  makeIntegerParam("epochs", lower=20, upper=20)
+)
+
+
+
+

+ In this example, we are varying four parameters, + batch_size, activation, + optimizer, epochs. The entire parameter + space will be 5 x 3 x 3 x 1. +

+
+
+

After creating this file, we need to point PARAM_SET_FILE to it.

+
+
+
+
$ export PARAM_SET_FILE=mnist.R
+
+
+

Step 5. Submit your job.

+
+
+
$ ./test/test-1.sh mnist theta
+
+
+
+

+ The first argument is MODEL_NAME. If the name is registered in + test/cfg-prm-1.sh, it will use the pre-configured + parameter file. Otherwise, CANDLE will use + PARAM_SET_FILE we configured in step 4. +

+
+
+

+ You can specify the HPO search strategy. As you can see in + test/cfg-prm-1.sh, you are able to configure + PROPOSE_POINTS, + MAX_CONCURRENT_EVALUATIONS, + MAX_ITERATIONS, MAX_BUDGET, + DESIGN_SIZE. +

+
+
+
    +
  • +

    + DESIGN_SIZE is the number of param sets that will + be evaluated at the beginning of the HPO search. In this example, + CANDLE will select 10 random param sets out of 45 (see Step 4 + for the breakdown). +

    +
  • +
  • +

    MAX_ITERATIONS is the number of iterations.

    +
  • +
  • +

    + PROPOSE_POINTS is the number of param sets that + CANDLE will evaluate in each iteration. So, if + MAX_ITERATIONS=3 and + PROPOSE_POINTS=5, CANDLE will end up evaluating + 25 params (3 x 5 + 10). +

    +
  • +
  • +

    + MAX_BUDGET should be greater than total + evaluations. In this example, 45. +

    +
  • +
+
+
+
+
+

+ + + diff --git a/models/Comparator/cmp_baseline_keras2.py b/models/Comparator/cmp_baseline_keras2.py new file mode 100644 index 00000000..8fc4fead --- /dev/null +++ b/models/Comparator/cmp_baseline_keras2.py @@ -0,0 +1,103 @@ +import os +import subprocess +from pathlib import Path +import candle + + +class Comparator(candle.Benchmark): + pass + + +file_path = os.path.dirname(os.path.realpath(__file__)) + + +def initialize_parameters(default_model="cmp_default_model.txt"): + global file_path + bmk = Comparator(file_path, + default_model, + "keras", + prog="cmp_baseline", + desc="Meta-model to compare two models") + # Initialize parameters + gParameters = candle.finalize_parameters(bmk) + return gParameters + + +def run(gParameters): + print("COMPARATOR") + print(str(gParameters)) + global file_path + print("file_path: %s" % file_path) + output_dir = gParameters["output_dir"] + expid = gParameters["experiment_id"] + runid = gParameters["run_id"] + supervisor = Path(file_path).absolute().parent.parent + workflows = supervisor / "workflows" + #print(model_sh) + model1 = gParameters["model1"] + model2 = gParameters["model2"] + os.chdir(output_dir) + cmd = make_cmd(str(workflows), expid, runid) + run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ + / model1 / "Output" / expid / runid + #print("env: " + str(env)) + print("cmd: " + str(cmd)) + results = [] + for i in [1, 2]: + model_name = gParameters["model%i" % i] + env = make_env(str(workflows), model_name) + print("command is ", cmd, "/nenv is:", env) + with open(str(run_dir) + "/start-%i.log" % i, "w") as start_log: + subprocess.run(cmd, + env=env, + stdout=start_log, + stderr=subprocess.STDOUT) + run_dir = Path(os.getenv("CANDLE_DATA_DIR")) \ + / model_name / "Output" / expid / runid + with open(run_dir / "result.txt") as fp: + line = fp.readline() + results[i] = int(line) + print("cmp: result %i: %f" % (i, results[i])) + print("Comparator DONE.") + + +def make_env(workflows, model_name): + output_dir = "./tmp" + expid 
= 'one_exp' + env = { + "WORKFLOWS_ROOT": workflows, + "TURBINE_OUTPUT": output_dir, + "EXPID": expid, + "SITE": "lambda", + "OBJ_RETURN": "loss", + "BENCHMARK_TIMEOUT": "120", + "MODEL_NAME": model_name, + "CANDLE_MODEL_TYPE": "SINGULARITY", + "CANDLE_DATA_DIR": os.getenv("CANDLE_DATA_DIR"), + "ADLB_RANK_OFFSET": "0", + "CANDLE_IMAGE": "/software/improve/images/GraphDRP.sif" + } + return env + + +def make_cmd(workflows, expid, runid): + model_sh = workflows + "/common" + "/sh" + "/model.sh" + cmd = [ + "bash", + model_sh, + "keras2", + "{}", # empty JSON fragment + expid, + runid + ] + + return cmd + + +def main(): + gParameters = initialize_parameters() + run(gParameters) + + +if __name__ == "__main__": + main() diff --git a/models/Comparator/cmp_default_model.txt b/models/Comparator/cmp_default_model.txt new file mode 100644 index 00000000..9bcea1e0 --- /dev/null +++ b/models/Comparator/cmp_default_model.txt @@ -0,0 +1,6 @@ +[Global_Params] + +model_name = 'graphdrp' + +model1 = 'graphdrp' +model2 = 'graphdrp' # 'graphdrp2' diff --git a/models/OneD/README.md b/models/OneD/README.md new file mode 100644 index 00000000..c465a5b1 --- /dev/null +++ b/models/OneD/README.md @@ -0,0 +1,12 @@ +# File organization: +- Name the main file where the actual model resides as _baseline_ or <_pytorch>.py +- .py for the Benchmark class +- _default_model.txt + +Please follow the above conventions for naming files, all lowercase filenames. +`model_name` is a required keyword for all models. 
+ +This would enable the model a user to run `python oned_baseline_keras2.py` + +Users never change parameters inside the file oned_baseline_keras2.py, any parameters needed for tweaking or optimizing the model +must be provide vi oned_default_model.txt diff --git a/models/OneD/oned.py b/models/OneD/oned.py new file mode 100644 index 00000000..ded2ebc6 --- /dev/null +++ b/models/OneD/oned.py @@ -0,0 +1,28 @@ +import candle +import os + +# Define any needed additional args to ensure all new args are command-line accessible. +additional_definitions = [{ + 'name': 'x', + 'type': float, + 'nargs': 1, + 'help': '1D function, derived form cosine mixture' +}, { + 'name': 'new_keyword', + 'type': str, + 'nargs': 1, + 'help': 'helpful description' +}] + +# Define args that are required. +required = None + + +# Extend candle.Benchmark to configure the args +class IBenchmark(candle.Benchmark): + + def set_locals(self): + if required is not None: + self.required = set(required) + if additional_definitions is not None: + self.additional_definitions = additional_definitions diff --git a/models/OneD/oned_baseline_keras2.py b/models/OneD/oned_baseline_keras2.py new file mode 100644 index 00000000..e869029b --- /dev/null +++ b/models/OneD/oned_baseline_keras2.py @@ -0,0 +1,109 @@ +import os +import candle +from oned import IBenchmark + +# Just because the tensorflow warnings are a bit verbose +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +# This should be set outside as a user environment variable +os.environ['CANDLE_DATA_DIR'] = os.environ['HOME'] + '/improve_data_dir' + +# file_path becomes the default location of the oned_default_model.txt file +file_path = os.path.dirname(os.path.realpath(__file__)) + + +# In the initialize_parameters() method, we will instantiate the base +# class, and finally build an argument parser to recognize your customized +# parameters in addition to the default parameters.The initialize_parameters() +# method should return a python dictionary, which 
will be passed to the run() +# method. +def initialize_parameters(): + i_bmk = IBenchmark( + file_path, # this is the path to this file needed to find default_model.txt + 'oned_default_model.txt', # name of the default_model.txt file + 'keras', # framework, choice is keras or pytorch + prog='oned_baseline', # basename of the model + desc='IMPROVE Benchmark') + + gParameters = candle.finalize_parameters( + i_bmk) # returns the parameter dictionary built from + # default_model.txt and overwritten by any + # matching comand line parameters. + + return gParameters + + +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf + + +def func(x, n=1): + # "func" takes in two arguments: "x" and "n", n is set to 1. + # The function returns a calculation using the input "x" and a default value of "n" equal to 1. + # The calculation is a linear combination of three trigonometric functions (sine, cosine) + # with the addition of a random normal variable scaled by the input "n". + + #y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos( + # 0.25 * x - 0.3) + n * np.random.normal(0, 0.2, 1) + + # remove random part + y = 0.02 * x + 0.5 * np.sin(1 * x + 0.1) + 0.75 * np.cos(0.25 * x - 0.3) + + print("oned: f(x=%0.3f) => y=%0.3f" % (x, y)) + + return y + + +def run(params): + # fetch data + # preprocess data + # save preprocessed data + # define callbacks + # build / compile model + # train model + # infer using model + # etc + print("running third party code") + + x = params['x'] + y = func(x) + + print("IMPROVE_RESULT: " + str(y)) + + print("returning training metrics: ", y) + + h = tf.keras.callbacks.History() + h.history.setdefault('val_loss') + + y_array = np.ndarray(2) + y_array.fill(y) + h.history['val_loss'] = y_array + return h + return { + "val_loss": y, + } # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + # Dumping results into file, workflow requirement + val_scores = { + 'key': 
'val_loss', + 'value': metrics['val_loss'], + 'val_loss': metrics['val_loss'], + } + + with open(params['output_dir'] + "/scores.json", "w", + encoding="utf-8") as f: + json.dump(val_scores, f, ensure_ascii=False, indent=4) + + return metrics # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + +def main(): + params = initialize_parameters() + scores = run(params) + + +if __name__ == "__main__": + main() diff --git a/models/OneD/oned_default_model.txt b/models/OneD/oned_default_model.txt new file mode 100644 index 00000000..93439811 --- /dev/null +++ b/models/OneD/oned_default_model.txt @@ -0,0 +1,3 @@ +[global] +model_name="1D" +x=1 diff --git a/models/Random/random_baseline_keras2.py b/models/Random/random_baseline_keras2.py new file mode 100644 index 00000000..4f9eff34 --- /dev/null +++ b/models/Random/random_baseline_keras2.py @@ -0,0 +1,104 @@ +""" +SUPERVISOR MODEL RANDOM +Simply returns a random number in [0,10) as val_loss +""" + +import os + +import tensorflow as tf +import numpy as np + +import candle + +# file_path becomes the default location of the oned_default_model.txt file +file_path = os.path.dirname(os.path.realpath(__file__)) + + +class BenchmarkRandom(candle.Benchmark): + """ Our subclass implementation of a CANDLE Benchmark """ + def set_locals(self): + pass + +# In the initialize_parameters() method, we will instantiate the base +# class, and finally build an argument parser to recognize your customized +# parameters in addition to the default parameters.The initialize_parameters() +# method should return a python dictionary, which will be passed to the run() +# method. 
+def initialize_parameters(): + bmk = BenchmarkRandom( + # The path to this file needed to find default_model.txt: + file_path, + # The name of the default_model.txt file: + 'random_default_model.txt', + 'keras', # framework, choice is keras or pytorch + prog='random_baseline', # basename of the model + desc='Supervisor Benchmark Random') + + # Get the parameter dictionary built from + # random_default_model.txt and modified by any + # matching command line parameters: + gParameters = candle.finalize_parameters(bmk) + + return gParameters + + +def model_implementation(params): + """ The implementation of the model w/o CANDLE conventions """ + + from random import random + if "crash_probability" in params: + crash_probability = float(params["crash_probability"]) + if random() < crash_probability: + raise FakeCrashException() + + result = random() * 10 + return result + + +class FakeCrashException(Exception): + """ + A dummy uncaught Exception to test error handling in Supervisor + """ + pass + + +def run(params): + + result = model_implementation(params) + + print("IMPROVE_RESULT: " + str(result)) + + h = tf.keras.callbacks.History() + h.history.setdefault('val_loss') + + y_array = np.ndarray(2) + y_array.fill(result) + h.history['val_loss'] = y_array + return h + return { + "val_loss": result, + } # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + # Dumping results into file, workflow requirement + val_scores = { + 'key': 'val_loss', + 'value': metrics['val_loss'], + 'val_loss': metrics['val_loss'], + } + + with open(params['output_dir'] + "/scores.json", "w", + encoding="utf-8") as f: + json.dump(val_scores, f, ensure_ascii=False, indent=4) + + return metrics # metrics is used by the supervisor when running + # HPO workflows (and possible future non HPO workflows) + + +def main(): + params = initialize_parameters() + scores = run(params) + + +if __name__ == "__main__": + main() diff --git 
a/python/eqpy/eqpy.py b/python/eqpy/eqpy.py index 30cfc3e7..99a8debc 100644 --- a/python/eqpy/eqpy.py +++ b/python/eqpy/eqpy.py @@ -1,5 +1,5 @@ -import threading import sys +import threading try: from queue import Queue @@ -10,9 +10,11 @@ input_q = Queue() output_q = Queue() + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): global input_q result = input_q.get() diff --git a/python/hyperopt/Readme.md b/python/hyperopt/Readme.md index 028e92b6..f1132a0e 100644 --- a/python/hyperopt/Readme.md +++ b/python/hyperopt/Readme.md @@ -1,21 +1,24 @@ -# EQPy-enabled Hyperopt # +# EQPy-enabled Hyperopt Files: -* eqpy_hyperopt/ - eqpy_hyperopt python package -* tests/ - unit tests for eqpy_hyperopt +- eqpy_hyperopt/ - eqpy_hyperopt python package +- tests/ - unit tests for eqpy_hyperopt eqpy_hyperopt/hyperopt_runner.py contains code that integrates hyperopt with a swift script via eqpy get and put calls. -Initialize eqpy_hyperopt from swift with +Initialize eqpy_hyperopt from swift with + ``` EQPy_init_package(ME,"eqpy_hyperopt.hyperopt_runner") ``` + On initialization eqpy_hyperopt will put an empty string in the output queue as handshake for swift to receive. Swift should then send a string containing the hyperopt parameters. This string should be formatted as a python dictionary. For example, + ``` { 'space' : hyperopt.hp.uniform(\'x\', -2, 2), @@ -24,24 +27,25 @@ python dictionary. For example, 'param_batch_size' : 10 } ``` + The elements of the dictionary are: -* space : see https://github.com/hyperopt/hyperopt/wiki/FMin#2-defining-a-search-space - The set of possible arguments to the model. +- space : see https://github.com/hyperopt/hyperopt/wiki/FMin#2-defining-a-search-space + The set of possible arguments to the model. -* algo : search algorithm - This object, such as `hyperopt.rand.suggest` and - `hyperopt.tpe.suggest` provides logic for sequential search of the - hyperparameter space. 
+- algo : search algorithm + This object, such as `hyperopt.rand.suggest` and + `hyperopt.tpe.suggest` provides logic for sequential search of the + hyperparameter space. -* max_evals : int - Allow up to this many function evaluations before returning. +- max_evals : int + Allow up to this many function evaluations before returning. -* param_batch_size : int - Retrieve at most this many new parameters sets from the search - algorithm for evaluation up to max_evals. Note that the actual - number of new parameter sets to evaluate is dependent on the - search algorithm. +- param_batch_size : int + Retrieve at most this many new parameters sets from the search + algorithm for evaluation up to max_evals. Note that the actual + number of new parameter sets to evaluate is dependent on the + search algorithm. Once these are received eqpy_hyperopt will initialize hyperopt and put the first of set (up to `param_batch_size`) in size in the output queue for swift @@ -50,9 +54,11 @@ model with these parameters. The evaluation results should be returned as a "," separated string where is element is a single number. For example, + ``` -1.23434,0.42422,-0.0001 ``` + The order of the results in the results string should match the order of the parameters (i.e. the first number in the results string is the result of the first model evaluation). @@ -61,7 +67,8 @@ When the `max_evals` number of evaluations has occurred, eqpy_hyperopt will put "FINAL" in the output queue, and then put the best parameters in the output queue. -## Tests ## +## Tests + The tests test basic eqpy_hyperopt functionality by running it 'stand-alone' without any eqpy mediated interation and also using eqpy but in a pure python context. @@ -70,7 +77,8 @@ Run the unit tests from within the tests directory with `python -m unittest test_hyperopt` - Source settings.sh to set the PYTHONPATH correctly. 
- - ## Misc ## - Pymongo / BSON were causing issues on Cori so that's "monkey patched" by setting `hyperopt.base.have_bson = False`. +Source settings.sh to set the PYTHONPATH correctly. + +## Misc + +Pymongo / BSON were causing issues on Cori so that's "monkey patched" by setting `hyperopt.base.have_bson = False`. diff --git a/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py b/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py index ee97269b..be590726 100644 --- a/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py +++ b/python/hyperopt/eqpy_hyperopt/hyperopt_runner.py @@ -1,18 +1,20 @@ from __future__ import print_function -import numpy as np -import eqpy -from hyperopt import base, hp +import eqpy import hyperopt +import numpy as np +from hyperopt import base, hp # monkey patch hyperopt not to use bson. We don't # use any of the pymongo / bson parts of hyperopt and # they cause issues when running on Cori's compute node base.have_bson = False + class Runner: - def __init__(self, algo, domain, max_evals, param_batch_size, trials, rstate): + def __init__(self, algo, domain, max_evals, param_batch_size, trials, + rstate): self.algo = algo self.domain = domain self.max_evals = max_evals @@ -27,13 +29,13 @@ def run(self): if n_to_enqueue + done > self.max_evals: n_to_enqueue = self.max_evals - done - #print("to enqueue {}".format(n_to_enqueue)) + # print("to enqueue {}".format(n_to_enqueue)) new_ids = self.trials.new_trial_ids(n_to_enqueue) - #print("new_ids size: {}".format(len(new_ids))) + # print("new_ids size: {}".format(len(new_ids))) self.trials.refresh() new_trials = self.algo(new_ids, self.domain, self.trials, - self.rstate.randint(2 ** 31 - 1)) + self.rstate.randint(2**31 - 1)) if len(new_trials): self.trials.insert_trial_docs(new_trials) self.trials.refresh() @@ -45,16 +47,20 @@ def run(self): self.trials.refresh() def evaluate(self): - new_trials = [t for t in self.trials._dynamic_trials if t['state'] == base.JOB_STATE_NEW] - params = [t['misc']['vals'] for t in 
new_trials] + new_trials = [ + t for t in self.trials._dynamic_trials + if t["state"] == base.JOB_STATE_NEW + ] + params = [t["misc"]["vals"] for t in new_trials] rvals = self.domain.fn(params) for i in range(len(new_trials)): t = new_trials[i] - t['result'] = rvals[i] - t['state'] = base.JOB_STATE_DONE + t["result"] = rvals[i] + t["state"] = base.JOB_STATE_DONE self.trials.refresh() + def eqpy_func(params): retvals = [] # unpack and send to out @@ -64,10 +70,11 @@ def eqpy_func(params): # get result and format for hyperopt result = eqpy.IN_get() split_result = result.split(",") - return [{'loss': float(x), 'status' : base.STATUS_OK} for x in split_result] + return [{"loss": float(x), "status": base.STATUS_OK} for x in split_result] + def run(): - """run function for eqpy based run""" + """run function for eqpy based run.""" eqpy.OUT_put("") # params should be formatted as a dictionary @@ -76,14 +83,22 @@ def run(): trials = base.Trials() rstate = None - if 'seed' in hp_dict: - rstate = np.random.RandomState(hp_dict['seed']) - - fmin(eqpy_func, hp_dict['space'], hp_dict['algo'], hp_dict['max_evals'], - hp_dict['param_batch_size'], trials, rstate) + if "seed" in hp_dict: + rstate = np.random.RandomState(hp_dict["seed"]) + + fmin( + eqpy_func, + hp_dict["space"], + hp_dict["algo"], + hp_dict["max_evals"], + hp_dict["param_batch_size"], + trials, + rstate, + ) eqpy.OUT_put("FINAL") eqpy.OUT_put(str(trials.argmin)) + def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): """Minimize a function over a hyperparameter space. @@ -128,7 +143,8 @@ def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): a trials object, then that trials object will be affected by side-effect of this call. 
- rstate : numpy.RandomState, default numpy.random""" + rstate : numpy.RandomState, default numpy.random + """ if rstate is None: rstate = np.random.RandomState() @@ -136,6 +152,5 @@ def fmin(fn, space, algo, max_evals, param_batch_size, trials, rstate=None): # need a domain to pass to the algorithm to provide the space domain = base.Domain(fn, space, pass_expr_memo_ctrl=None) - runner = Runner(algo, domain, max_evals, param_batch_size, - trials, rstate) + runner = Runner(algo, domain, max_evals, param_batch_size, trials, rstate) runner.run() diff --git a/python/hyperopt/tests/test_hyperopt.py b/python/hyperopt/tests/test_hyperopt.py index 98b9de12..763862e1 100644 --- a/python/hyperopt/tests/test_hyperopt.py +++ b/python/hyperopt/tests/test_hyperopt.py @@ -1,58 +1,71 @@ from __future__ import print_function -import eqpy_hyperopt.hyperopt_runner as hr -from hyperopt import hp, base, tpe, rand -import numpy as np +import ast import math - import threading -import eqpy -import ast - import unittest +import eqpy +import eqpy_hyperopt.hyperopt_runner as hr +import numpy as np +from hyperopt import base, hp, rand, tpe def math_sin_func(params): retvals = [] - #print("len params: {}".format(len(params))) + # print("len params: {}".format(len(params))) for p in params: - x = p['x'][0] + x = p["x"][0] r = math.sin(x) - retvals.append({'loss': float(r), 'status': base.STATUS_OK}) + retvals.append({"loss": float(r), "status": base.STATUS_OK}) return retvals + class TestHyperopt(unittest.TestCase): def test_simple_rand(self): - space = hp.uniform('x', -2, 2) + space = hp.uniform("x", -2, 2) max_evals = 100 trials = base.Trials() - algo = rand.suggest #tpe.suggest + algo = rand.suggest # tpe.suggest param_batch_size = 10 # if seed is changed then the test will fail rstate = np.random.RandomState(42) - hr.fmin(math_sin_func, space, algo, max_evals, - param_batch_size, trials, rstate=rstate) + hr.fmin( + math_sin_func, + space, + algo, + max_evals, + param_batch_size, + trials, 
+ rstate=rstate, + ) self.assertEqual(len(trials.results), 100) - self.assertAlmostEqual(trials.argmin['x'], -1.5805633657891858) + self.assertAlmostEqual(trials.argmin["x"], -1.5805633657891858) def test_simple_tpe(self): - space = hp.uniform('x', -2, 2) + space = hp.uniform("x", -2, 2) max_evals = 100 trials = base.Trials() - algo = tpe.suggest #tpe.suggest + algo = tpe.suggest # tpe.suggest # note that tpe won't always return more than 1 # parameter conbimation max_parallel_param_count = 10 # if seed is changed then the test will fail rstate = np.random.RandomState(42) - hr.fmin(math_sin_func, space, algo, max_evals, - max_parallel_param_count, trials, rstate=rstate) + hr.fmin( + math_sin_func, + space, + algo, + max_evals, + max_parallel_param_count, + trials, + rstate=rstate, + ) self.assertEqual(len(trials.results), 100) - self.assertAlmostEqual(trials.argmin['x'], -1.5708577298673572) + self.assertAlmostEqual(trials.argmin["x"], -1.5708577298673572) def test_eqpy(self): p = threading.Thread(target=hr.run) @@ -66,25 +79,27 @@ def test_eqpy(self): eqpy.input_q.put(hp_params_dict) # gets initial set of candidate parameters result = eqpy.output_q.get() - while (True): + while True: # result = {'x': [1.8382913715287232]};{...} split_result = result.split(";") - rs = ",".join([str(math.sin(ast.literal_eval(r)['x'][0])) for r in split_result]) + rs = ",".join([ + str(math.sin(ast.literal_eval(r)["x"][0])) for r in split_result + ]) # iff algo is rand.suggest, then len(split_result) should # equal max_parallel_param_count self.assertEqual(len(split_result), 10) eqpy.input_q.put(rs) # get the next set of candidate parameters result = eqpy.output_q.get() - if (result == "FINAL"): + if result == "FINAL": break # get final result self.assertEqual("{'x': -1.5477895914281512}", eqpy.output_q.get()) def test_no_seed(self): - """ Tests that passing no seed to eqpy_hyperopt doesn't raise - an exception """ + """Tests that passing no seed to eqpy_hyperopt doesn't raise an + 
exception.""" p = threading.Thread(target=hr.run) p.start() @@ -97,21 +112,24 @@ def test_no_seed(self): eqpy.input_q.put(hp_params_dict) # gets initial set of candidate parameters result = eqpy.output_q.get() - while (True): + while True: # result = {'x': [1.8382913715287232]};{...} split_result = result.split(";") - rs = ",".join([str(math.sin(ast.literal_eval(r)['x'][0])) for r in split_result]) + rs = ",".join([ + str(math.sin(ast.literal_eval(r)["x"][0])) for r in split_result + ]) # iff algo is rand.suggest, then len(split_result) should # equal max_parallel_param_count self.assertEqual(len(split_result), 10) eqpy.input_q.put(rs) # get the next set of candidate parameters result = eqpy.output_q.get() - if (result == "FINAL"): + if result == "FINAL": break # get final result self.assertTrue(len(eqpy.output_q.get()) > 0) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/scratch/crusher/README.adoc b/scratch/crusher/README.adoc new file mode 100644 index 00000000..c38de0b1 --- /dev/null +++ b/scratch/crusher/README.adoc @@ -0,0 +1,10 @@ + +Run with + +---- +$ ./test.sh hello.swift +---- + +Wait for job to complete, then check `turbine_output/output.txt` + +Logs are in `turbine_output/` diff --git a/scratch/crusher/hello.swift b/scratch/crusher/hello.swift new file mode 100644 index 00000000..79ecd0d2 --- /dev/null +++ b/scratch/crusher/hello.swift @@ -0,0 +1,2 @@ +import io; +printf("HELLO"); diff --git a/scratch/crusher/py-tf.swift b/scratch/crusher/py-tf.swift new file mode 100644 index 00000000..87ce81e7 --- /dev/null +++ b/scratch/crusher/py-tf.swift @@ -0,0 +1,18 @@ + +import io; +import python; + +r = python(---- +import sys, traceback +try: + sys.argv = ['python'] + import torch +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in Python code: \\n' + repr(e) + ' ... 
\\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() +----, + "repr(torch.__version__)"); // +printf("PyTorch version: %s", r); diff --git a/scratch/crusher/py0.swift b/scratch/crusher/py0.swift new file mode 100644 index 00000000..4b262431 --- /dev/null +++ b/scratch/crusher/py0.swift @@ -0,0 +1,7 @@ + +import io; +import python; + +i = python("print(\"python works\")", + "repr(2+2)"); +printf("result of 2+2='%s'", i); diff --git a/scratch/crusher/test.sh b/scratch/crusher/test.sh new file mode 100755 index 00000000..e28c61fe --- /dev/null +++ b/scratch/crusher/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash -l +set -eu + +if (( ${#} != 1 )) +then + echo "Provide the workflow!" + exit 1 +fi + +WORKFLOW=$1 + +MED106=/gpfs/alpine/world-shared/med106 +SWIFT=/gpfs/alpine/world-shared/med106/gounley1/crusher2/swift-t-install + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=/gpfs/alpine/world-shared/med106/gounley1/crusher2/conda520 + +which swift-t + +export PROJECT=MED106_crusher +export QUEUE=batch +export WALLTIME=00:05:00 +export PROCS=2 +export PPN=2 + +export TURBINE_LAUNCHER=srun + +set -x +swift-t -m slurm -n $PROCS -e PYTHONHOME=$PY $WORKFLOW diff --git a/scratch/csv2f64/f64_2csv.c b/scratch/csv2f64/f64_2csv.c index 0cdf2c29..43c74005 100644 --- a/scratch/csv2f64/f64_2csv.c +++ b/scratch/csv2f64/f64_2csv.c @@ -37,7 +37,7 @@ main(int argc, char* argv[]) int total_rows, total_cols; sscanf(argv[3], "%i", &total_rows); sscanf(argv[4], "%i", &total_cols); - + bool result = convert(input, output, total_rows, total_cols); if (!result) { @@ -93,7 +93,7 @@ convert_fps(FILE* fp_i, FILE* fp_o, int total_rows, int total_cols) int cols = 1; // current column counter int rows = 1; // current row counter size_t actual_r = 0; // actual number of items read at last fread - int offset = 0; // starting offset in chars + int offset = 0; // starting offset in chars bool b; int i = 0; while (true) @@ -119,7 +119,7 @@ convert_fps(FILE* fp_i, FILE* fp_o, int 
total_rows, int total_cols) fprintf(fp_o, "\n"); rows++; } - + free(chars); free(floats); return true; @@ -185,4 +185,3 @@ format_double(int total_cols, double value, return true; } - diff --git a/scratch/csv2f64/f64_2hdf.py b/scratch/csv2f64/f64_2hdf.py index 9e1bc741..fdae655f 100644 --- a/scratch/csv2f64/f64_2hdf.py +++ b/scratch/csv2f64/f64_2hdf.py @@ -1,30 +1,27 @@ #!/usr/bin/env python3 +import argparse import sys import h5py import numpy as np -import argparse parser = argparse.ArgumentParser() -parser.add_argument('input', - help='The input f64 file') -parser.add_argument('output', - help='The output HDF file') +parser.add_argument("input", help="The input f64 file") +parser.add_argument("output", help="The output HDF file") args = parser.parse_args(sys.argv[1:]) print(args.input, args.output) - -f = h5py.File(args.output, 'r+') +f = h5py.File(args.output, "r+") print(f.keys()) -ds = f['conv1d_1']['conv1d_1']['kernel:0'] -a = ds[:,:,:] +ds = f["conv1d_1"]["conv1d_1"]["kernel:0"] +a = ds[:, :, :] # print(ds.shape) # print(ds.dtype) -a8 = a.astype('float64') +a8 = a.astype("float64") # print(a[0,0,0]) -a8 = np.fromfile(args.input, dtype='float64') +a8 = np.fromfile(args.input, dtype="float64") diff --git a/scratch/csv2f64/hdf2f64.py b/scratch/csv2f64/hdf2f64.py index e66feaca..3b111c4d 100644 --- a/scratch/csv2f64/hdf2f64.py +++ b/scratch/csv2f64/hdf2f64.py @@ -1,27 +1,24 @@ #!/usr/bin/env python3 +import argparse import sys import h5py -import argparse parser = argparse.ArgumentParser() -parser.add_argument('input', - help='The input H5 file') -parser.add_argument('output', - help='The output f64 file') +parser.add_argument("input", help="The input H5 file") +parser.add_argument("output", help="The output f64 file") args = parser.parse_args(sys.argv[1:]) print(args) -f = h5py.File(args.input, 'r') +f = h5py.File(args.input, "r") print(f.keys()) -ds = f['conv1d_1']['conv1d_1']['kernel:0'] -a = ds[:,:,:] +ds = f["conv1d_1"]["conv1d_1"]["kernel:0"] +a = ds[:, 
:, :] # print(ds.shape) # print(ds.dtype) -a8 = a.astype('float64') -# print(a[0,0,0]) +a8 = a.astype("float64") a8.tofile(args.output) diff --git a/scratch/csv2f64/inject-noise.py b/scratch/csv2f64/inject-noise.py index 7079c86f..3d263936 100644 --- a/scratch/csv2f64/inject-noise.py +++ b/scratch/csv2f64/inject-noise.py @@ -1,15 +1,14 @@ #!/usr/bin/env python3 -import random, sys +import argparse +import random +import sys import numpy as np -import argparse parser = argparse.ArgumentParser() -parser.add_argument('file', - help='The file to modify') -parser.add_argument('rate', - help='The fraction to modify') +parser.add_argument("file", help="The file to modify") +parser.add_argument("rate", help="The fraction to modify") args = parser.parse_args(sys.argv[1:]) @@ -17,9 +16,9 @@ rate = float(args.rate) -a8 = np.fromfile(args.file, dtype='float64') -print('input size: ', a8.shape[0]) -print('flip pct: ', rate, '%') +a8 = np.fromfile(args.file, dtype="float64") +print("input size: ", a8.shape[0]) +print("flip pct: ", rate, "%") rate = rate / 100 flips = 0 @@ -30,4 +29,4 @@ flips += 1 a8.tofile(args.file) -print('flipped: ', flips) +print("flipped: ", flips) diff --git a/scratch/csv2f64/test/data-4x3.csv b/scratch/csv2f64/test/data-4x3.csv index 0eb6ecb8..551dac9a 100644 --- a/scratch/csv2f64/test/data-4x3.csv +++ b/scratch/csv2f64/test/data-4x3.csv @@ -2,5 +2,3 @@ 0.2,5,100 6,5,10 70,6.2,-2 - - diff --git a/scratch/csv2f64/test/data-5x3.csv b/scratch/csv2f64/test/data-5x3.csv index 2b7e069f..b1398260 100644 --- a/scratch/csv2f64/test/data-5x3.csv +++ b/scratch/csv2f64/test/data-5x3.csv @@ -3,4 +3,3 @@ 6,-16.5,10 70,6.2,-2 42,-32,22 - diff --git a/scratch/csv2f64/test/err-4x3-1.csv b/scratch/csv2f64/test/err-4x3-1.csv index a73da841..e4583f91 100644 --- a/scratch/csv2f64/test/err-4x3-1.csv +++ b/scratch/csv2f64/test/err-4x3-1.csv @@ -2,5 +2,3 @@ 0.2,,5,100 6,5,10 70,6.2,-2 - - diff --git a/scratch/csv2f64/test/err-4x3-2.csv b/scratch/csv2f64/test/err-4x3-2.csv index 
a77de647..ae6a53f8 100644 --- a/scratch/csv2f64/test/err-4x3-2.csv +++ b/scratch/csv2f64/test/err-4x3-2.csv @@ -2,5 +2,3 @@ 0.2,,5,100 6,5,10x 70,6.2,-2 - - diff --git a/scratch/fake-lbann/test_1.py b/scratch/fake-lbann/test_1.py index 4d1c9d75..9f1e4500 100644 --- a/scratch/fake-lbann/test_1.py +++ b/scratch/fake-lbann/test_1.py @@ -1,4 +1,5 @@ import os + import fl_interface comm = os.getenv("COMM") diff --git a/scratch/histawk/hist.awk b/scratch/histawk/hist.awk index f968525e..e05c75f5 100644 --- a/scratch/histawk/hist.awk +++ b/scratch/histawk/hist.awk @@ -12,7 +12,7 @@ BEGIN { } } -{ +{ C[$0] = C[$0]+1; } diff --git a/scratch/horovod/horovod-1.py b/scratch/horovod/horovod-1.py index cddb5c20..de2326a1 100644 --- a/scratch/horovod/horovod-1.py +++ b/scratch/horovod/horovod-1.py @@ -1,2 +1,2 @@ horovod = "/home/wozniak/proj/horovod" -execfile(horovod+"/examples/keras_mnist.py") +execfile(horovod + "/examples/keras_mnist.py") diff --git a/scratch/horovod/test-2.swift b/scratch/horovod/test-2.swift index 81ead2f7..23ed8569 100644 --- a/scratch/horovod/test-2.swift +++ b/scratch/horovod/test-2.swift @@ -6,4 +6,4 @@ int exitcode = @par=2 launch("python", a1); printf("%i", exitcode); string a2[] = [ "/home/nick/Documents/repos/horovod/examples/keras_mnist.py", "Instance_2" ]; -int e2 = @par=2 launch("python", a2); \ No newline at end of file +int e2 = @par=2 launch("python", a2); diff --git a/scratch/horovod2/test-2.C b/scratch/horovod2/test-2.C index 7d6a7ee5..e49e5253 100644 --- a/scratch/horovod2/test-2.C +++ b/scratch/horovod2/test-2.C @@ -5,8 +5,7 @@ #include "controller.h" -int -main() +int main() { printf("OK\n"); return 0; diff --git a/scratch/horovod2/test-2.c b/scratch/horovod2/test-2.c index 7d6a7ee5..e49e5253 100644 --- a/scratch/horovod2/test-2.c +++ b/scratch/horovod2/test-2.c @@ -5,8 +5,7 @@ #include "controller.h" -int -main() +int main() { printf("OK\n"); return 0; diff --git a/scratch/horovod2/test-5-1.py b/scratch/horovod2/test-5-1.py index 
1a1d09fc..ea84ab82 100644 --- a/scratch/horovod2/test-5-1.py +++ b/scratch/horovod2/test-5-1.py @@ -1,19 +1,18 @@ - # TEST 5-1 from __future__ import print_function print("TEST 5-1 PY") +import math + +import horovod.keras as hvd import keras +import tensorflow as tf +from keras import backend as K from keras.datasets import mnist +from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K -import math -import tensorflow as tf -import horovod.keras as hvd # Horovod: initialize Horovod. hvd.init() diff --git a/scratch/indices/i1.swift b/scratch/indices/i1.swift index 173ec82a..07347170 100644 --- a/scratch/indices/i1.swift +++ b/scratch/indices/i1.swift @@ -29,10 +29,9 @@ printf("A:") => printf("B[\"%s\"]=%i", k0, v0); C[string2int(k0)] = v0; } - + foreach v1, k1 in C { printf("C[%i]=%i", k1, v1); } } - diff --git a/scratch/launch-opts/README.adoc b/scratch/launch-opts/README.adoc index 468b5413..7a287a85 100644 --- a/scratch/launch-opts/README.adoc +++ b/scratch/launch-opts/README.adoc @@ -4,7 +4,7 @@ Edit +example.sh+ to select a +TURBINE_LAUNCH_OPTIONS+. Then: ---- -$ ./example.sh +$ ./example.sh TURBINE-THETA SCRIPT NODES=2 PROCS=2 @@ -15,7 +15,7 @@ running qsub ... Job routed to queue "debug-cache-quad". Memory mode set to cache quad for queue debug-cache-quad JOB_ID=161050 -$ +$ ---- Wait for job completion. 
Then: diff --git a/scratch/load/load.py b/scratch/load/load.py index 96613588..d14e5001 100644 --- a/scratch/load/load.py +++ b/scratch/load/load.py @@ -1,11 +1,10 @@ - # LOAD PY # Convert the date/time markers in the extracted start/stop times # into plottable data -from datetime import datetime import sys +from datetime import datetime load = 0 D = [] @@ -14,15 +13,18 @@ print("usage: load.py START STOP < INPUT") exit(1) + def parse(d): return datetime.strptime(d, "%Y-%m-%d %H:%M:%S") + def emit(d, old_load, load): print("%0.2f %03i" % (d.timestamp() - ts_start, old_load)) print("%0.2f %03i" % (d.timestamp() - ts_start, load)) + start = parse(sys.argv[1]) -stop = parse(sys.argv[2]) +stop = parse(sys.argv[2]) ts_start = start.timestamp() diff --git a/scratch/py-eval/Makefile b/scratch/py-eval/Makefile index a0830b06..4f4add31 100644 --- a/scratch/py-eval/Makefile +++ b/scratch/py-eval/Makefile @@ -20,23 +20,39 @@ MPI_ENABLED = 0 # Cori # Module tensorflow/intel-head -PYTHON_HOME = /usr/common/software/tensorflow/intel-tensorflow/head -PYTHON_VERSION_MAJOR = 2 -PYTHON_VERSION_MINOR = 7 +# PYTHON_HOME = /usr/common/software/tensorflow/intel-tensorflow/head +# PYTHON_VERSION_MAJOR = 2 +# PYTHON_VERSION_MINOR = 7 +# PYTHON_VERSION_SUFFIX = +# CC = gcc # module load gcc + +CC = gcc + +# Dunedin 3.8 Spack +PYTHON_HOME = /usr +PYTHON_VERSION_MAJOR = 3 +PYTHON_VERSION_MINOR = 8 PYTHON_VERSION_SUFFIX = -CC = gcc # module load gcc -# Dunedin +# Dunedin 3.7.3 TF +# PYTHON_HOME = ${HOME}/Public/sfw/Python-3.7.3-tf +# PYTHON_VERSION_MAJOR = 3 +# PYTHON_VERSION_MINOR = 7 +# PYTHON_VERSION_SUFFIX = m + +# Dunedin 2.7 # PYTHON_HOME = /usr # PYTHON_VERSION_MAJOR = 2 # PYTHON_VERSION_MINOR = 7 # PYTHON_VERSION_SUFFIX = + # End Python settings PYTHON_VERSION = $(PYTHON_VERSION_MAJOR).$(PYTHON_VERSION_MINOR)$(PYTHON_VERSION_SUFFIX) INCLUDES = -I $(PYTHON_HOME)/include/python$(PYTHON_VERSION) -LIBS = -L $(PYTHON_HOME)/lib -lpython$(PYTHON_VERSION) +# LIBS = -L $(PYTHON_HOME)/lib +LIBS 
+= -lpython$(PYTHON_VERSION) -ldl RPATHS = -Wl,-rpath -Wl,$(PYTHON_HOME)/lib DEFINES = -DPYTHON_VERSION_MAJOR=$(PYTHON_VERSION_MAJOR) \ diff --git a/scratch/py-eval/py-eval.c b/scratch/py-eval/py-eval.c index 5289ec1b..edeec357 100644 --- a/scratch/py-eval/py-eval.c +++ b/scratch/py-eval/py-eval.c @@ -5,6 +5,8 @@ #include +#include + #include #include @@ -57,6 +59,18 @@ python_init() { if (initialized) return true; verbose("python: initializing..."); + + + char str_python_lib[32]; +#ifdef _WIN32 + sprintf(str_python_lib, "lib%s.dll", PYTHON_NAME); +#elif defined __unix__ + sprintf(str_python_lib, "lib%s.so", "python3.8"); +#elif defined __APPLE__ + sprintf(str_python_lib, "lib%s.dylib", PYTHON_NAME); +#endif + dlopen(str_python_lib, RTLD_NOW | RTLD_GLOBAL); + Py_InitializeEx(1); main_module = PyImport_AddModule("__main__"); if (main_module == NULL) return handle_python_exception(); @@ -65,6 +79,20 @@ python_init() local_dict = PyDict_New(); if (local_dict == NULL) return handle_python_exception(); initialized = true; + + // long val = 43; + char* val = "MY VALUE!"; + // if (PyDict_SetItemString(main_dict, "myvar", PyLong_FromLong(val))) { + if (PyDict_SetItemString(main_dict, "myvar", val)) { + assert(false); + } + + char* result; + PyObject* po = PyDict_GetItemString(main_dict, "myvar"); + int pc = PyArg_Parse(po, "s", &result); + if (pc != 1) return handle_python_non_string(po); + printf("result: %s\n", result); + return true; } diff --git a/scratch/py-eval/py/err.py b/scratch/py-eval/py/err.py index 3892497c..aa2bdf66 100644 --- a/scratch/py-eval/py/err.py +++ b/scratch/py-eval/py/err.py @@ -1 +1 @@ -2+ +# 2+ diff --git a/scratch/py-eval/py/import-stringio.py b/scratch/py-eval/py/import-stringio.py index 2f93186a..0e4adf21 100644 --- a/scratch/py-eval/py/import-stringio.py +++ b/scratch/py-eval/py/import-stringio.py @@ -1,3 +1,5 @@ from StringIO import StringIO + + def get_string_io(): return StringIO() diff --git a/scratch/py-eval/py/numpy-array.py 
b/scratch/py-eval/py/numpy-array.py index 99f8c648..676903b5 100644 --- a/scratch/py-eval/py/numpy-array.py +++ b/scratch/py-eval/py/numpy-array.py @@ -1,2 +1 @@ A = numpy.array(3) - diff --git a/scratch/py-eval/py/numpy-print-A.py b/scratch/py-eval/py/numpy-print-A.py index 089fd8aa..c757e0ac 100644 --- a/scratch/py-eval/py/numpy-print-A.py +++ b/scratch/py-eval/py/numpy-print-A.py @@ -1,3 +1 @@ print(A) - - diff --git a/scratch/resizer/resize.py b/scratch/resizer/resize.py new file mode 100644 index 00000000..ecd6275b --- /dev/null +++ b/scratch/resizer/resize.py @@ -0,0 +1,90 @@ +# RESIZE PY + +description = "Resize and/or add noise to CSV data." + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "--resize", + action="store", + default=1.0, + help=""" + Output size scale compared to input size as float. + Examples: + 1.0=same size, + 0.5=half size, + 2.0=double size.""", + ) + parser.add_argument( + "--noise", + action="store", + default=0.0, + help="""" + Noise injection as float. 
+ Examples: + 0.0=no noise + 0.1=noise +/- 10%""", + ) + parser.add_argument("input", action="store", help="The input CSV.") + parser.add_argument("output", action="store", help="The output CSV.") + args = parser.parse_args() + argvars = vars(args) + # print(str(argvars)) + return argvars + + +def write_data(args, fp, data_out): + from random import random + + wholes = int(float(args["resize"])) + noise = float(args["noise"]) + rows, cols = data_out.shape + for i in range(0, wholes): + for row in range(0, rows): + for col in range(0, cols - 1): + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f," % value) + col += 1 + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f" % value) + fp.write("\n") + fraction = float(args["resize"]) - wholes + for row in range(0, int(fraction * rows)): + for col in range(0, cols - 1): + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f," % value) + col += 1 + value = data_out[row, col] + if noise != 0.0: + value = value * (1 - noise) + value * (noise * 2) * random() + fp.write("%f" % value) + fp.write("\n") + + +import sys + +import numpy as np + +args = parse_args() + +data_in = np.loadtxt(args["input"], delimiter=",") +data_out = np.copy(data_in) + +if args["output"] == "/dev/stdout" or args["output"] == "-": + fp = sys.stdout +else: + fp = open(args["output"], "w") + +write_data(args, fp, data_out) + +if fp is not sys.stdout: + fp.close() diff --git a/scratch/resizer/sample-1.csv b/scratch/resizer/sample-1.csv new file mode 100644 index 00000000..d3494f6d --- /dev/null +++ b/scratch/resizer/sample-1.csv @@ -0,0 +1,3 @@ +1,2,3 +4,5,6 +7,8,9 diff --git a/scratch/sbcast/README.adoc b/scratch/sbcast/README.adoc new file mode 100644 index 00000000..1f8edaa8 --- /dev/null +++ b/scratch/sbcast/README.adoc @@ -0,0 +1,26 @@ 
+ +Two test cases to broadcast a file to `/dev/shm` on all compute nodes. +Both cases run the same Swift workflow (!), they just use different external settings +to move the data. + +== sbcast + +Usage: +---- +$ ./sbcast-1.sh +---- + +Inserts an `sbcast` command into the `turbine-slurm.sh` script for execution +just before the workflow starts. Does this via `TURBINE_PRELAUNCH` +(https://swift-lang.github.io/swift-t/sites.html#turbine_prelaunch[guide]). + +== MPI-IO + +Usage: +---- +$ ./mpi-io.sh +---- + +Uses a Turbine leader hook +(http://swift-lang.github.io/swift-t/guide.html#hooks[guide]) +to use MPI-IO to make the file copy. diff --git a/scratch/sbcast/hook-1.tcl b/scratch/sbcast/hook-1.tcl new file mode 100644 index 00000000..f835145b --- /dev/null +++ b/scratch/sbcast/hook-1.tcl @@ -0,0 +1,36 @@ + +# HOOK TCL +# This code runs on each leader rank, +# i.e., once per node. + +# Set a root data directory +set root $env(HOME)/data +puts "HOOK HOST: [exec hostname]" + +# Get the leader communicator from ADLB +set comm [ adlb::comm_get leaders ] +# Get my rank among the leaders +set rank [ adlb::comm_rank $comm ] + +# If I am rank=0, construct the list of files to copy +set EXPORTED_DATA_DIR /ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno +set EXPORTED_DATA_FILE top_21_auc_1fold.uno.h5 + +if { $rank == 0 } { + set files [ list $EXPORTED_DATA_DIR/$EXPORTED_DATA_FILE ] + puts "files: $files" +} + +# Broadcast the file list to all leaders +turbine::c::bcast $comm 0 files + +# Make a node-local data directory +set LOCAL_PREFIX /dev/shm + +# Copy each file to the node-local directory +foreach f $files { + if { $rank == 0 } { + puts "copying: $f" + } + turbine::c::copy_to $comm $f $LOCAL_PREFIX +} diff --git a/scratch/sbcast/mpi-io.sh b/scratch/sbcast/mpi-io.sh new file mode 100755 index 00000000..1b4a8492 --- /dev/null +++ b/scratch/sbcast/mpi-io.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# Add Swift/T to PATH +ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier 
+SWIFT=$ROOT/swift-t/2023-02-23 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH +# Add Python to PATH +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PATH=$PY/bin:$PATH + +# Set up data +EXPORTED_DATA_DIR="/ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno" +EXPORTED_DATA_FILE="top_21_auc_1fold.uno.h5" + +# Scheduler settings +export PROJECT=MED106 + +THIS=$( realpath . ) + +# Run the workflow! +swift-t -m slurm \ + -e TURBINE_LEADER_HOOK_STARTUP="$( sed 's/#.*//;s/$/;/' $THIS/hook-1.tcl )" \ + sbcast-1.swift diff --git a/scratch/sbcast/sbcast-1.sh b/scratch/sbcast/sbcast-1.sh new file mode 100755 index 00000000..f6cfb98c --- /dev/null +++ b/scratch/sbcast/sbcast-1.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +# Add Swift/T to PATH +ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +SWIFT=$ROOT/swift-t/2023-02-23 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH +# Add Python to PATH +PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PATH=$PY/bin:$PATH + +# Set up data +EXPORTED_DATA_DIR="/ccs/home/hm0/med106_proj/Benchmarks/Pilot1/Uno" +EXPORTED_DATA_FILE="top_21_auc_1fold.uno.h5" + +# Scheduler settings +export PROJECT=MED106 +export TURBINE_PRELAUNCH="sbcast -f -F 8 $EXPORTED_DATA_DIR/$EXPORTED_DATA_FILE /dev/shm/$EXPORTED_DATA_FILE" + +# Run the workflow! 
+swift-t -m slurm sbcast-1.swift diff --git a/scratch/sbcast/workflow.swift b/scratch/sbcast/workflow.swift new file mode 100644 index 00000000..5b4f33bb --- /dev/null +++ b/scratch/sbcast/workflow.swift @@ -0,0 +1,13 @@ + +import io; +import sys; + +app ls(string dir) +{ + "ls" dir ; +} + +local_prefix = "/dev/shm"; + +printf("local_prefix: '%s'", local_prefix) => + ls(local_prefix); diff --git a/scratch/spock/README.adoc b/scratch/spock/README.adoc new file mode 100644 index 00000000..c38de0b1 --- /dev/null +++ b/scratch/spock/README.adoc @@ -0,0 +1,10 @@ + +Run with + +---- +$ ./test.sh hello.swift +---- + +Wait for job to complete, then check `turbine_output/output.txt` + +Logs are in `turbine_output/` diff --git a/scratch/spock/hello.swift b/scratch/spock/hello.swift new file mode 100644 index 00000000..79ecd0d2 --- /dev/null +++ b/scratch/spock/hello.swift @@ -0,0 +1,2 @@ +import io; +printf("HELLO"); diff --git a/scratch/spock/py-tf.swift b/scratch/spock/py-tf.swift new file mode 100644 index 00000000..d48ef6a4 --- /dev/null +++ b/scratch/spock/py-tf.swift @@ -0,0 +1,18 @@ + +import io; +import python; + +r = python(---- +import sys, traceback +try: + sys.argv = ['python'] + import tensorflow as tf +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in Python code: \\n' + repr(e) + ' ... 
\\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() +----, + "repr(tf.__version__)"); // +printf("TensorFlow version: %s", r); diff --git a/scratch/spock/py0.swift b/scratch/spock/py0.swift new file mode 100644 index 00000000..4b262431 --- /dev/null +++ b/scratch/spock/py0.swift @@ -0,0 +1,7 @@ + +import io; +import python; + +i = python("print(\"python works\")", + "repr(2+2)"); +printf("result of 2+2='%s'", i); diff --git a/scratch/spock/test.sh b/scratch/spock/test.sh new file mode 100755 index 00000000..aa591d17 --- /dev/null +++ b/scratch/spock/test.sh @@ -0,0 +1,32 @@ +#!/bin/bash -l +set -eu + +if (( ${#} != 1 )) +then + echo "Provide the workflow!" + exit 1 +fi + +WORKFLOW=$1 + +MED106=/gpfs/alpine/world-shared/med106 +ROOT=$MED106/sw/spock/gcc-10.3.0 +SWIFT=$ROOT/swift-t/2021-10-05 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=/gpfs/alpine/med106/world-shared/hsyoo/spock_tf2_py37_rocm42 + +which swift-t + +export PROJECT=MED106 +export QUEUE=ecp +export WALLTIME=00:05:00 +export PROCS=2 +export PPN=2 + +export TURBINE_LAUNCHER=srun + +set -x +swift-t -m slurm -n $PROCS -e PYTHONHOME=$PY $WORKFLOW diff --git a/scratch/summit/README.adoc b/scratch/summit/README.adoc index be05a9ea..90236df4 100644 --- a/scratch/summit/README.adoc +++ b/scratch/summit/README.adoc @@ -4,11 +4,17 @@ This is a stand-alone Swift/T test for Summit. 
Use ---- -./workflow.sh hello.swift +$ ./workflow.sh hello.swift ---- or ---- -./workflow.sh pyr.swift +$ ./workflow.sh pyr.swift +---- + +For an interactive test, get on a batch node and run: + +---- +$ ./workflow-interactive.sh hello.swift ---- diff --git a/scratch/summit/py-tf.swift b/scratch/summit/py-tf.swift new file mode 100644 index 00000000..e41a689c --- /dev/null +++ b/scratch/summit/py-tf.swift @@ -0,0 +1,17 @@ + +import io; +import python; + +result_python = python(""" +import sys, traceback +try: + sys.argv = [ 'python' ] + import tensorflow as tf +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print(str(e) + ' ... \\n' + ''.join(s)) + sys.stdout.flush() +""", + "repr(40+2)"); +printf("result_python: %s", result_python); diff --git a/scratch/summit/workflow-interactive.sh b/scratch/summit/workflow-interactive.sh new file mode 100755 index 00000000..7cca3158 --- /dev/null +++ b/scratch/summit/workflow-interactive.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu + +# WORKFLOW INTERACTIVE SH + +if [[ ${#} != 1 ]] +then + echo "Specify a Swift script!" + exit 1 +fi +SCRIPT=$1 + +MED106=/gpfs/alpine/world-shared/med106 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-10-22 +PATH=$SWIFT/stc/bin:$PATH + +# This is for an interactive run: +export TURBINE_LAUNCHER=jsrun +export TURBINE_LAUNCH_OPTIONS="-r 4" + +set -x +swift-t -n $PROCS $SCRIPT diff --git a/scratch/summit/workflow.sh b/scratch/summit/workflow.sh index 42aa99f9..f9f7a666 100755 --- a/scratch/summit/workflow.sh +++ b/scratch/summit/workflow.sh @@ -8,21 +8,17 @@ then fi SCRIPT=$1 -SWIFT= - -module load spectrum-mpi/10.3.1.2-20200121 - -G=/sw/summit/gcc/6.4.0/lib64 -R="" -LD_LIBRARY_PATH=$G:$R:$LD_LIBRARY_PATH +THIS=$( readlink --canonicalize $( dirname $0 ) ) +SV=$( readlink --canonicalize $THIS/../.. 
) +source $SV/workflows/common/sh/env-summit-tf-2.4.1.sh +# Basic Swift/T environment settings: export PROJECT=MED106 -# export QUEUE=debug export PPN=2 -PROCS=4 +PROCS=2 -SWIFT=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c/stc/bin/swift-t +which swift-t set -x -$SWIFT -m lsf -n $PROCS \ +swift-t -p -m lsf -n $PROCS -e PYTHONHOME \ $SCRIPT diff --git a/scratch/swift-tests/app-py.swift b/scratch/swift-tests/app-py.swift new file mode 100644 index 00000000..4b4b21a8 --- /dev/null +++ b/scratch/swift-tests/app-py.swift @@ -0,0 +1,6 @@ +app p() +{ + "./fake-model.sh" "hi" "bye"; +} + +p(); diff --git a/scratch/swift-tests/fake-model.py b/scratch/swift-tests/fake-model.py new file mode 100644 index 00000000..480627a8 --- /dev/null +++ b/scratch/swift-tests/fake-model.py @@ -0,0 +1,3 @@ +# import something ? + +print("fake-model.py: python works") diff --git a/scratch/swift-tests/fake-model.sh b/scratch/swift-tests/fake-model.sh new file mode 100755 index 00000000..3e49a186 --- /dev/null +++ b/scratch/swift-tests/fake-model.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# source langs-app-X.sh + +echo "fake-model.sh: PWD=$PWD" + +python fake-model.py diff --git a/scripts/shrink-log-single.sh b/scripts/shrink-log-single.sh new file mode 100755 index 00000000..26b980eb --- /dev/null +++ b/scripts/shrink-log-single.sh @@ -0,0 +1,33 @@ +#!/bin/sh +set -eu + +# SHRINK LOG SINGLE SH +# Called by shrink-logs.mk + +INPUT=$1 +OUTPUT=$2 + +NAME=$( basename --suffix=.txt $INPUT ) + +# Temp file for tr output: +T=$( mktemp --tmpdir=$TMP_SHRINK --suffix .txt tr-XXX ) + +if [ $INPUT == $T ] +then + echo "shrink-log-single.sh: ERROR: INPUT is wrong." 
+ exit 1 +fi + +if [ "${THIS:-}" == "" ] +then + THIS=$( readlink --canonicalize $( dirname $0 ) ) +fi + +# This converts the TensorFlow line overwrite behavior to +# normal newlines: +tr "\r" "\n" < $INPUT > $T + +# Does the log parsing and shrinking: +python $THIS/shrink-log.py $T $OUTPUT + +rm $T diff --git a/scripts/shrink-log.mk b/scripts/shrink-log.mk new file mode 100644 index 00000000..f218bc03 --- /dev/null +++ b/scripts/shrink-log.mk @@ -0,0 +1,17 @@ + +.DELETE_ON_ERROR: + +# Logs on stdout from Python runs +OUTS = $(wildcard out-*.txt) +# Logs in model.log from containers +RUNS = $(shell find . -name model.log) + +SUMMARIES = $(subst out-,summary-,$(OUTS)) $(subst model,summary,$(RUNS)) + +all: $(SUMMARIES) + +summary-%.txt: out-%.txt + @ ${THIS}/shrink-log-single.sh $(<) $(@) + +%/summary.log: %/model.log + @ ${THIS}/shrink-log-single.sh $(<) $(@) diff --git a/scripts/shrink-log.py b/scripts/shrink-log.py new file mode 100644 index 00000000..8e94bbe8 --- /dev/null +++ b/scripts/shrink-log.py @@ -0,0 +1,105 @@ +# SHRINK LOG PY +# argv: 2 filenames : tr file and summary-*.txt +# Called by shrink-log-single.sh +# The tr file should have used tr to change CR to NL +# Removes non-printing characters (backspace) +# Reduces the number of training lines in output +# Removes redundant batch size information +# Fixes newline before "Current time" report + +import os +import re +import stat +import sys +import time +from collections import deque + +# Only 1/shrink_factor training lines are copied +shrink_factor = 100 +# Number of additional consecutive lines at beginning and end of +# training that are retained +hold_space = 3 + + +def shrink(fp_in, fp_out): + # Queue to hold extra lines that may be printed at end of run + Q = deque() + index = 0 + starts = 0 # Initial hold_space ETAs are immediately printed + line_previous = "" + for line in fp_in: + if len(line) == 1: + continue # Blank line + line = line.replace("\b", "") + if "batch:" in line or "Current" in 
line: + # Found a training line + line = re.sub("- batch: .* 32.0000 -", "", line) + line = line.replace("Current", "\nCurrent") + if starts < hold_space: + fp_out.write(line) + starts += 1 + continue + Q.append(line) + index += 1 + if len(Q) > hold_space: + line = Q.popleft() + if index % shrink_factor == 0: + fp_out.write(line) + else: + starts = 0 + # Found a non-training line + # Flush the Q: + while len(Q) > 0: + fp_out.write(Q.popleft()) + if line == line_previous: + # Discard redundant lines + continue + # Good line: write it + fp_out.write(line) + line_previous = line + # Done: flush Q: + while len(Q) > 0: + fp_out.write(Q.popleft()) + + +# From https://www.codegrepper.com/code-examples/python/python+get+human+readable+file+size +def hsize(size, decimal_places=2): + if size < 1024: + return "%4i B" % size + size /= 1024 + for unit in ["KB", "MB", "GB", "TB"]: + if size < 1024: + break + size /= 1024 + return f"{size:.{decimal_places}f} {unit}" + + +file_in = sys.argv[1] +file_out = sys.argv[2] + +# Do not process files that have not changed since the last run +# of this script: +if os.path.exists( + file_out) and os.path.getmtime(file_in) < os.path.getmtime(file_out): + print("skipping: " + file_in) + exit() + +t0 = time.time() +s0 = os.stat(file_in) +z0 = s0[stat.ST_SIZE] +h0 = hsize(z0) +print("shrink: %11s %s" % (h0, file_out)) + +with open(file_in, "r") as fp_in: + with open(file_out, "w") as fp_out: + shrink(fp_in, fp_out) + +s1 = os.stat(file_out) +t1 = time.time() +z1 = s1[stat.ST_SIZE] + +t = t1 - t0 +rate = hsize(z0 / t) + +print("shrank: %0.2fs %11s/s %11s -> %11s %s" % + (t, rate, hsize(z0), hsize(z1), file_out)) diff --git a/scripts/shrink-logs.sh b/scripts/shrink-logs.sh new file mode 100755 index 00000000..40846f5a --- /dev/null +++ b/scripts/shrink-logs.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +# SHRINK LOGS SH +# Accepts a whole workflow output directory +# Clean up and shrink TensorFlow output logs +# See shrink-log.py for details +# 
Parallelizable via make + +THIS=$( realpath $( dirname $0 ) ) +SUPERVISOR=$( realpath $THIS/.. ) +export THIS + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an output DIR (e.g., .../experiments/X042/out)!" \ + DIR - ${*} + +if ! which python 2>&1 > /dev/null +then + echo "shrink-logs.sh: Add python to PATH!" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +# This is used inside the Makefile below: +export TMP_SHRINK=/tmp/$USER/shrink +mkdir -pv $TMP_SHRINK + +cd $DIR +nice -n 19 make -j 4 -f $THIS/shrink-log.mk diff --git a/spack/spack.yaml b/spack/spack.yaml index 7e7743f5..eac5b770 100644 --- a/spack/spack.yaml +++ b/spack/spack.yaml @@ -5,18 +5,18 @@ spack: # add package specs to the `specs` list specs: - - py-keras ^py-theano+gpu - - py-scikit-learn - - py-pandas - - py-requests - - py-mdanalysis ^py-matplotlib@:2.2.3+image - - r-mlrmbo ^r-plotly@4.5.6 - - r-rgenoud - - r-dicekriging - - r-randomforest - - r-jsonlite - - stc@develop ^turbine@develop+python+r - - eqr + - py-keras ^py-theano+gpu + - py-scikit-learn + - py-pandas + - py-requests + - py-mdanalysis ^py-matplotlib@:2.2.3+image + - r-mlrmbo ^r-plotly@4.5.6 + - r-rgenoud + - r-dicekriging + - r-randomforest + - r-jsonlite + - stc@develop ^turbine@develop+python+r + - eqr config: {} mirrors: {} modules: @@ -25,12 +25,17 @@ spack: packages: all: providers: - mpi: [mvapich2@2.3 arch=linux-rhel7-x86_64, spectrum-mpi@rolling-release arch=linux-rhel7-ppc64le] + mpi: + [ + mvapich2@2.3 arch=linux-rhel7-x86_64, + spectrum-mpi@rolling-release arch=linux-rhel7-ppc64le, + ] buildable: true version: [] paths: {} modules: {} - compiler: [gcc@7.3.0 arch=linux-rhel7-x86_64, gcc@7.3.1 arch=linux-rhel7-ppc64le] + compiler: + [gcc@7.3.0 arch=linux-rhel7-x86_64, gcc@7.3.1 arch=linux-rhel7-ppc64le] python: buildable: true version: [3.7.2] diff --git a/workflows/GA/README.md b/workflows/GA/README.md 
index db42def1..bac4e162 100644 --- a/workflows/GA/README.md +++ b/workflows/GA/README.md @@ -1,34 +1,33 @@ -# GA (genetic algorithm) based based hyperparameter optimization on CANDLE Benchmarks # +# GA (genetic algorithm) based hyperparameter optimization on CANDLE Benchmarks -The GA workflow uses the Python deap package (http://deap.readthedocs.io/en/master) to optimize hyperparameters using a genetic algorithm. +The GA workflow uses the Python deap package (http://deap.readthedocs.io/en/master) to optimize hyperparameters using a genetic algorithm. -## Running ## +## Running 1. cd into the **Supervisor/workflows/GA/test** directory 2. Specify the GA parameters in the **cfg-prm-1.sh** file (see [below](#structure) for more information on the GA parameters) 3. Specify the PROCS, QUEUE etc. in **cfg-sys-1.sh** file 4. You will pass the MODEL_NAME, SITE, and optional experiment id arguments to **test-1.sh** file when launching: -`./test-1.sh [expid]` -where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) + `./test-1.sh [expid]` + where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) 5. Update the parameter space json file if necessary. The parameter space is defined in json file (see **workflows/GA/data/tc1_param_space_ga.json** for an example with tc1). The -**cfg-prm-1.sh** script will attempt to select the correct json given the model name. Edit that file as appropriate. The parameter space json file is further described [here](#config) + **cfg-prm-1.sh** script will attempt to select the correct json given the model name. Edit that file as appropriate. The parameter space json file is further described [here](#config) 6. The benchmark will be run for the number of processors specified 7. 
Final objective function values, along with parameters, will be available in the experiments directory in a **finals_results** file and also printed to standard out. - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and switch to the `master` branch. Then `cd` to `workflows/GA` (the directory containing this README). -* TC1 or other benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- TC1 or other benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. -* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data - Python specific installation requirements: +Python specific installation requirements: 1. pandas 2. deap @@ -41,10 +40,10 @@ directory to the PYTHONPATH specified in **cfg-sys-1.sh**. For example, `export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages` - -## Calling sequence ## +## Calling sequence Function calls: + ``` test-1.sh -> swift/workflow.sh -> @@ -61,13 +60,16 @@ test-1.sh -> swift/workflow.sh -> ``` Scheduling scripts: + ``` test-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files ``` + ## Making Changes To create your own SITE files in workflows/common/sh/: + - langs-SITE.sh - langs-app-SITE.sh - modules-SITE.sh @@ -77,17 +79,18 @@ copy existing ones but modify the langs-SITE.sh file to define the EQPy location ### Structure ### -The point of the script structure is that it is easy to make copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back -into the repo for use by others. 
The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` +The point of the script structure is that it is easy to copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back +into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1.sh` and `cfg-{sys,prm}-1.sh` should be unmodified for simple testing. The relevant parameters for the GA algorithm are defined in `cfg-prm-*.sh` scripts (see example in `cfg-prm-1.sh`). These are: + - SEED: The random seed used by deap in the GA. - NUM_ITERATIONS: The number of iterations the GA should perform. -- POPULATION_SIZE: The maximum number of hyperparameter sets to evaluate in each iteration. -GA_STRATEGY: The algorithm used by the GA. Can be one of "simple" or "mu_plus_lambda". See eaSimple and eaMuPlusLambda at https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms for more information. +- POPULATION_SIZE: The maximum number of hyperparameter sets to evaluate in each iteration. +- GA_STRATEGY: The algorithm used by the GA. Can be one of "simple" or "mu_plus_lambda". See eaSimple and eaMuPlusLambda at https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms for more information. ### Hyperparameter Configuration File ### @@ -96,95 +99,109 @@ The GA workflow uses a json format file for defining the hyperparameter space. T The hyperparameter configuration file has a json format consisting of a list of json dictionaries, each one of which defines a hyperparameter. Each dictionary has the following required keys: -* name: the name of the hyperparameter (e.g. *epochs*) -* type: determines how the initial population (i.e. 
the hyperparameter sets) are initialized from the named parameter and how those values are subsequently mutated by the GA. Type is one of `constant`, `int`, `float`, `logical`, `categorical`, or `ordered`. - * `constant`: - * each model is initialized with the same specifed value - * mutation always returns the same specified value - * `int`: - * each model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds - * mutation is peformed by adding the results of a random draw from - a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. - * `float`: - * each model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds - * mutation is peformed by adding the results of a random draw from - a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. - * `logical`: - * each model is initialized with a random boolean. - * mutation flips the logical value - * `categorical`: - * each model is initialized with an element chosen at random from the list of elements in `values`. - * mutation chooses an element from the `values` list at random - * `ordered`: - * each model is inititalized with an element chosen at random from the list of elements in `values`. - * given the index of the current value in the list of `values`, mutation selects the element *n* number of indices away, where n is the result of a random draw between 1 and `sigma` and then is negated with a 0.5 probability. +- name: the name of the hyperparameter (e.g. _epochs_) +- type: determines how the initial population (i.e. the hyperparameter sets) are initialized from the named parameter and how those values are subsequently mutated by the GA. Type is one of `constant`, `int`, `float`, `logical`, `categorical`, or `ordered`. 
+ - `constant`: + - each model is initialized with the same specified value + - mutation always returns the same specified value + - `int`: + - each model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds + - mutation is performed by adding the results of a random draw from + a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. + - `float`: + - each model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds + - mutation is performed by adding the results of a random draw from + a gaussian distribution to the current value, where the gaussian distribution's mu is 0 and its sigma is specified by the `sigma` entry. + - `logical`: + - each model is initialized with a random boolean. + - mutation flips the logical value + - `categorical`: + - each model is initialized with an element chosen at random from the list of elements in `values`. + - mutation chooses an element from the `values` list at random + - `ordered`: + - each model is initialized with an element chosen at random from the list of elements in `values`. + - given the index of the current value in the list of `values`, mutation selects the element _n_ number of indices away, where n is the result of a random draw between 1 and `sigma` and then is negated with a 0.5 probability. The following keys are required depending on value of the `type` key. If the `type` is `constant`: - * `value`: the constant value + +- `value`: the constant value If the `type` is `int`, or `float`: - * `lower`: the lower bound of the range to draw from - * `upper`: the upper bound of the range to draw from - * `sigma`: the sigma value used by the mutation operator (see above). + +- `lower`: the lower bound of the range to draw from +- `upper`: the upper bound of the range to draw from +- `sigma`: the sigma value used by the mutation operator (see above). 
If the `type` is `categorical`: - * `values`: the list of elements to choose from - * `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` + +- `values`: the list of elements to choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` If the `type` is `ordered`: - * `values`: the list of elements to choose from - * `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` - * `sigma`: the sigma value used by the mutation operator (see above). + +- `values`: the list of elements to choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` +- `sigma`: the sigma value used by the mutation operator (see above). A sample hyperparameter definition file: ```javascript [ { - "name": "activation", - "type": "categorical", - "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + name: "activation", + type: "categorical", + element_type: "string", + values: [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear", + ], }, { - "name": "optimizer", - "type": "categorical", - "element_type": "string", - "values": ["adam", "rmsprop"] + name: "optimizer", + type: "categorical", + element_type: "string", + values: ["adam", "rmsprop"], }, { - "name": "lr", - "type": "float", - "lower": 0.0001, - "upper": 0.01, - "sigma": "0.000495" + name: "lr", + type: "float", + lower: 0.0001, + upper: 0.01, + sigma: "0.000495", }, { - "name": "batch_size", - "type": "ordered", - "element_type": "int", - "values": [16, 32, 64, 128, 256], - "sigma": 1 - } -] + name: "batch_size", + type: "ordered", + element_type: "int", + values: [16, 32, 64, 128, 256], + sigma: 1, + }, +]; ``` Note that any other keys are ignored by the workflow but can be used to 
add additional information about the hyperparameter. For example, the sample files contain a `comment` entry that contains additional information about that hyperparameter. -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. +- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: @@ -197,10 +214,7 @@ Each successful run of the workflow will produce a `final_results_2` file. The f - gen: the generation / iteration - nevals: the number of evaluations performed in this generation. In generations after the first, this may be less the total population size as some combinations will already have been evaluated. - avg: the average score -- std: the standard deviation +- std: the standard deviation - min: the minimum score - max: the maximum score - ts: a timestamp recording when this generation finished. 
The value is the number of seconds since the epoch in floating point format - - - diff --git a/workflows/GA/data/adrp_param_space_ga.json b/workflows/GA/data/adrp_param_space_ga.json index 10aa92fd..c0f7689d 100644 --- a/workflows/GA/data/adrp_param_space_ga.json +++ b/workflows/GA/data/adrp_param_space_ga.json @@ -4,48 +4,45 @@ "type": "categorical", "element_type": "string", "values": [ - "1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000" + "500 250 125 60 30", + "250 125 60 30", + "400 150 75 30", + "300 175 90 45 20", + "400 200 100 50 25", + "350 170 85 40 20" ] }, - - { - "name": "dense_feature_layers", - "type": "categorical", - "element_type": "string", - "values": ["250 125 60 30", "500 250 125 60 30", "125 60 30"] - }, - { "name": "batch_size", "type": "ordered", "element_type": "int", - "values": [32, 64], + "values": [16, 32, 64], "sigma": 1 }, - { "name": "optimizer", "type": "categorical", "element_type": "string", - "values": ["adam", "sgd", "rmsprop"] - }, - - { - "name": "learning_rate", - "type": "float", - "lower": 0.00001, - "upper": 0.001, - "sigma": 0.0049995 + "values": ["adam", "sgd"] }, { "name": "epochs", "type": "int", - "lower": 2, - "upper": 2, + "lower": 200, + "upper": 200, "sigma": 20 + }, + { + "name": "dropout", + "type": "float", + "lower": 0.05, + "upper": 0.2, + "sigma": 0.045 + }, + { + "name": "activation", + "type": "categorical", + "element_type": "string", + "values": ["elu", "relu", "linear"] } ] diff --git a/workflows/GA/data/combo_param_space_ga.json b/workflows/GA/data/combo_param_space_ga.json index fd2139c4..a392d9df 100644 --- a/workflows/GA/data/combo_param_space_ga.json +++ b/workflows/GA/data/combo_param_space_ga.json @@ -16,22 +16,26 @@ "name": "dense", "type": "categorical", "element_type": "string", - "values": ["1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000"] + "values": [ + "1000", + "1000 1000", + "1000 1000 1000", 
+ "1000 1000 1000 1000", + "1000 1000 1000 1000 1000" + ] }, { "name": "dense_feature_layers", "type": "categorical", "element_type": "string", - "values": ["1000", - "1000 1000", - "1000 1000 1000", - "1000 1000 1000 1000", - "1000 1000 1000 1000 1000"] + "values": [ + "1000", + "1000 1000", + "1000 1000 1000", + "1000 1000 1000 1000", + "1000 1000 1000 1000 1000" + ] }, { @@ -90,16 +94,16 @@ { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -107,7 +111,7 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -122,8 +126,8 @@ { "name": "rho", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -131,7 +135,7 @@ "name": "momentum", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -143,16 +147,16 @@ { "name": "beta_1", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "beta_2", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 } ] diff --git a/workflows/GA/data/graphdrp_param_space.json b/workflows/GA/data/graphdrp_param_space.json new file mode 100644 index 00000000..0548c081 --- /dev/null +++ b/workflows/GA/data/graphdrp_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/data/graphdrp_param_space_ga.json b/workflows/GA/data/graphdrp_param_space_ga.json new file mode 100644 index 
00000000..ca3531a1 --- /dev/null +++ b/workflows/GA/data/graphdrp_param_space_ga.json @@ -0,0 +1,49 @@ +[ + { + + "name": "activation", + "type": "categorical", + "element_type": "string", + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] + }, + + { + "name": "optimizer", + "type": "categorical", + "element_type": "string", + "values": ["adam", "rmsprop"] + }, + + { + "name": "dropout", + "type": "float", + "lower": 0.0, + "upper": 0.9, + "sigma": 0.045 + }, + + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/data/hidra_param_space.json b/workflows/GA/data/hidra_param_space.json new file mode 100644 index 00000000..5dbd942e --- /dev/null +++ b/workflows/GA/data/hidra_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/data/igtd_param_space_ga.json b/workflows/GA/data/igtd_param_space_ga.json new file mode 100644 index 00000000..2df67c3c --- /dev/null +++ b/workflows/GA/data/igtd_param_space_ga.json @@ -0,0 +1,22 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.00001, + "upper": 0.1, + "sigma": 0.0049995 + }, + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [16, 32, 64, 128, 256], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 50 + } +] diff --git a/workflows/GA/data/nt3_param_space_ga.json b/workflows/GA/data/nt3_param_space_ga.json index 527d07b5..9c21f4f4 100644 --- 
a/workflows/GA/data/nt3_param_space_ga.json +++ b/workflows/GA/data/nt3_param_space_ga.json @@ -10,8 +10,8 @@ { "name": "epochs", "type": "int", - "lower": 100, - "upper": 500, + "lower": 1, + "upper": 5, "sigma": 20 }, @@ -19,18 +19,30 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { "name": "dense", "type": "categorical", "element_type": "string", - "values": ["500 100 50", - "1000 500 100 50", - "2000 1000 500 100 50", - "2000 1000 1000 500 100 50", - "2000 1000 1000 1000 500 100 50"] + "values": [ + "500 100 50", + "1000 500 100 50", + "2000 1000 500 100 50", + "2000 1000 1000 500 100 50", + "2000 1000 1000 1000 500 100 50" + ] }, { @@ -60,26 +72,28 @@ "name": "conv", "type": "categorical", "element_type": "string", - "values": ["50 50 50 50 50 1", - "25 25 25 25 25 1", - "64 32 16 32 64 1", - "100 100 100 100 100 1", - "32 20 16 32 10 1"] + "values": [ + "50 50 50 50 50 1", + "25 25 25 25 25 1", + "64 32 16 32 64 1", + "100 100 100 100 100 1", + "32 20 16 32 10 1" + ] }, { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -87,53 +101,52 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, + { + "name": "epsilon", + "type": "ordered", + "element_type": "float", + "values": [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], + "sigma": 1 + }, + + { + "name": "rho", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + }, + + { + "name": "momentum", + "type": "float", + "lower": 0, + "upper": 1e1, + "sigma": 0.5 + }, - { - "name": "epsilon", - 
"type": "ordered", - "element_type": "float", - "values": [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], - "sigma": 1 - }, - - { - "name": "rho", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - }, - - { - "name": "momentum", - "type": "float", - "lower": 0, - "upper": 1e01, - "sigma": 0.5 - }, - - { - "name": "nesterov", - "type": "logical" - }, - - { - "name": "beta_1", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - }, - - { - "name": "beta_2", - "type": "float", - "lower": 1e-04, - "upper": 1e01, - "sigma": 0.499995 - } + { + "name": "nesterov", + "type": "logical" + }, + + { + "name": "beta_1", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + }, + + { + "name": "beta_2", + "type": "float", + "lower": 1e-4, + "upper": 1e1, + "sigma": 0.499995 + } ] diff --git a/workflows/GA/data/oned_param_space_ga.json b/workflows/GA/data/oned_param_space_ga.json new file mode 100644 index 00000000..9c21d17a --- /dev/null +++ b/workflows/GA/data/oned_param_space_ga.json @@ -0,0 +1,9 @@ +[ + { + "name": "x", + "type": "float", + "lower": 0.0, + "upper": 20.0, + "sigma": 0.1 + } +] diff --git a/workflows/GA/data/p1b1_param_space_ga.json b/workflows/GA/data/p1b1_param_space_ga.json index eae9684b..1def4610 100644 --- a/workflows/GA/data/p1b1_param_space_ga.json +++ b/workflows/GA/data/p1b1_param_space_ga.json @@ -31,12 +31,14 @@ "name": "dense", "type": "categorical", "element_type": "string", - "values": ["1500 500", - "978 978", - "978 978 978", - "978 978 978 978", - "978 978 978 978 978", - "978 978 978 978 978 978"] + "values": [ + "1500 500", + "978 978", + "978 978 978", + "978 978 978 978", + "978 978 978 978 978", + "978 978 978 978 978 978" + ] }, { @@ -95,16 +97,16 @@ { "name": "clipnorm", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "clipvalue", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, 
"sigma": 0.499995 }, @@ -112,11 +114,10 @@ "name": "decay", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, - { "name": "epsilon", "type": "ordered", @@ -128,8 +129,8 @@ { "name": "rho", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, @@ -137,7 +138,7 @@ "name": "momentum", "type": "float", "lower": 0, - "upper": 1e01, + "upper": 1e1, "sigma": 0.5 }, @@ -149,16 +150,16 @@ { "name": "beta_1", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 }, { "name": "beta_2", "type": "float", - "lower": 1e-04, - "upper": 1e01, + "lower": 1e-4, + "upper": 1e1, "sigma": 0.499995 } ] diff --git a/workflows/GA/data/paccmann_param_space.json b/workflows/GA/data/paccmann_param_space.json new file mode 100644 index 00000000..0548c081 --- /dev/null +++ b/workflows/GA/data/paccmann_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + } +] diff --git a/workflows/GA/data/random_param_space.json b/workflows/GA/data/random_param_space.json new file mode 100644 index 00000000..79125122 --- /dev/null +++ b/workflows/GA/data/random_param_space.json @@ -0,0 +1,30 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 5 + }, + + { + "name": "crash_probability", + "type": "constant", + "value": 0.25 + } + +] diff --git a/workflows/GA/data/tc1_param_space_ga.json b/workflows/GA/data/tc1_param_space_ga.json 
index eb321669..ffa1a4d4 100644 --- a/workflows/GA/data/tc1_param_space_ga.json +++ b/workflows/GA/data/tc1_param_space_ga.json @@ -23,7 +23,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { @@ -62,5 +72,4 @@ "type": "constant", "value": 5 } - ] diff --git a/workflows/GA/data/tcnns_param_space.json b/workflows/GA/data/tcnns_param_space.json new file mode 100644 index 00000000..30db57db --- /dev/null +++ b/workflows/GA/data/tcnns_param_space.json @@ -0,0 +1,23 @@ +[ + { + "name": "learning_rate", + "type": "float", + "lower": 0.000001, + "upper": 0.1, + "sigma": 0.01 + }, + + { + "name": "batch_size", + "type": "ordered", + "element_type": "int", + "values": [32, 64, 128, 256, 512, 1024, 2048], + "sigma": 1 + }, + + { + "name": "epochs", + "type": "constant", + "value": 3 + } +] diff --git a/workflows/GA/swift/workflow.sh b/workflows/GA/swift/workflow.sh index ce414d5b..69f18d2f 100755 --- a/workflows/GA/swift/workflow.sh +++ b/workflows/GA/swift/workflow.sh @@ -8,15 +8,7 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) -if [[ ! 
-d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] -then - echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" - exit 1 -fi -export BENCHMARKS_ROOT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/TC1:$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/examples/ADRP export BENCHMARK_TIMEOUT -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) @@ -32,40 +24,68 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" \ + "[CANDLE_MODEL_TYPE] [CANDLE_IMAGE]" } -if (( ${#} != 5 )) +if (( ${#} != 7 )) && (( ${#} != 5 )) then usage exit 1 fi -if ! { - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID - get_cfg_sys $3 - get_cfg_prm $4 - MODEL_NAME=$5 - } +if (( ${#} == 7 )) then + export CANDLE_MODEL_TYPE=$6 + export CANDLE_IMAGE=$7 +elif (( ${#} == 5 )) +then + CANDLE_MODEL_TYPE="BENCHMARKS" + CANDLE_IMAGE=NONE +else usage exit 1 fi -echo "Running "$MODEL_NAME "workflow" +TURBINE_OUTPUT="" +if [[ $CANDLE_MODEL_TYPE = "SINGULARITY" ]] +then + TURBINE_OUTPUT=$CANDLE_DATA_DIR/output + printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ + $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +fi + +get_site $1 # Sets SITE +get_expid $2 # Sets EXPID +get_cfg_sys $3 +get_cfg_prm $4 +MODEL_NAME=$5 source_site env $SITE -source_site sched $SITE +source_site sched $SITE -# Set PYTHONPATH for BENCHMARK related stuff EQPY=${EQPY:-$WORKFLOWS_ROOT/common/ext/EQ-Py} -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$EQPY +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python/ +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$EQPY 
+PYTHONPATH+=:$WORKFLOWS_ROOT/common/python + +export TURBINE_JOBNAME=$EXPID +RESTART_FILE_ARG="" +if [[ ${RESTART_FILE:-} != "" ]] +then + RESTART_FILE_ARG="--restart_file=$RESTART_FILE" +fi + +RESTART_NUMBER_ARG="" +if [[ ${RESTART_NUMBER:-} != "" ]] +then + RESTART_NUMBER_ARG="--restart_number=$RESTART_NUMBER" +fi -export TURBINE_JOBNAME="JOB:${EXPID}" CMD_LINE_ARGS=( -ga_params=$PARAM_SET_FILE -seed=$SEED -ni=$NUM_ITERATIONS @@ -91,14 +111,16 @@ log_script #copy the configuration files to TURBINE_OUTPUT cp $WORKFLOWS_ROOT/common/python/$GA_FILE $PARAM_SET_FILE $INIT_PARAMS_FILE $CFG_SYS $CFG_PRM $TURBINE_OUTPUT - # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run -# Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} -# This is used by the obj_app objective function +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +then + CANDLE_MODEL_IMPL="container" +fi +SWIFT_LIBS_DIR=${SWIFT_LIBS_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${SWIFT_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the candle_model_train_app function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh WAIT_ARG="" @@ -108,28 +130,80 @@ then echo "Turbine will wait for job completion." 
fi -# echo's anything following this to standard out - -swift-t -n $PROCS \ - ${MACHINE:-} \ - -p -I $EQPY -r $EQPY \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ - -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ - -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ - -e BENCHMARKS_ROOT \ - -e EMEWS_PROJECT_ROOT \ - $( python_envs ) \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e OBJ_RETURN \ - -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e MODEL_SH \ - -e MODEL_NAME \ - -e SITE \ - -e BENCHMARK_TIMEOUT \ - -e SH_TIMEOUT \ - -e IGNORE_ERRORS \ - $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} - +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + +#swift-t -n $PROCS \ +# -o $TURBINE_OUTPUT/workflow.tic \ +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + STDOUT="" +fi + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "CANDLE_DATA_DIR is not set in the environment! Exiting..." 
+ exit 1 +fi + +( + which python swift-t + swift-t -O 0 -n $PROCS \ + ${MACHINE:-} \ + -p -I $EQPY -r $EQPY \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ + -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + $( python_envs ) \ + -e APP_PYTHONPATH \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e MODEL_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e SH_TIMEOUT \ + -e TURBINE_STDOUT \ + -e IGNORE_ERRORS \ + -e CANDLE_DATA_DIR \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} +) + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +echo "EXIT CODE: 0" | tee -a $STDOUT + +# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) 
+echo $TURBINE_OUTPUT > turbine-directory.txt diff --git a/workflows/GA/swift/workflow.swift b/workflows/GA/swift/workflow.swift index 0d4836ff..a64d7f1f 100644 --- a/workflows/GA/swift/workflow.swift +++ b/workflows/GA/swift/workflow.swift @@ -3,6 +3,7 @@ * WORKFLOW.SWIFT */ +import assert; import io; import sys; import files; @@ -10,10 +11,12 @@ import location; import string; import unix; import EQPy; -import R; import assert; import python; +import candle_utils; +report_env(); + string emews_root = getenv("EMEWS_PROJECT_ROOT"); string turbine_output = getenv("TURBINE_OUTPUT"); string resident_work_ranks = getenv("RESIDENT_WORK_RANKS"); @@ -21,12 +24,16 @@ string r_ranks[] = split(resident_work_ranks,","); string strategy = argv("strategy"); string ga_params_file = argv("ga_params"); -string init_params_file = argv("init_params", ""); +// string init_params_file = argv("init_params", ""); float mut_prob = string2float(argv("mutation_prob", "0.2")); string exp_id = argv("exp_id"); int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); +string candle_image = getenv("CANDLE_IMAGE"); +string init_params_file = getenv("INIT_PARAMS_FILE"); + printf("TURBINE_OUTPUT: " + turbine_output); string restart_number = argv("restart_number", "1"); @@ -60,13 +67,14 @@ string FRAMEWORK = "keras"; } else if (params == "EQPY_ABORT") { - printf("EQPy Aborted"); + printf("EQPy aborted..."); string why = EQPy_get(ME); // TODO handle the abort if necessary // e.g. write intermediate results ... 
printf("%s", why) => v = propagate() => - c = false; + c = false => + assert(false, "EQPY aborted!"); } else { @@ -74,7 +82,7 @@ string FRAMEWORK = "keras"; string results[]; foreach param, j in param_array { - results[j] = obj(param, "%00i_%000i_%0000i" % (restart_number,i,j)); + results[j] = candle_model_train(param, exp_id, "%00i_%000i_%0000i" % (restart_number,i,j), model_name); } string res = join(results, ";"); // printf(res); @@ -88,7 +96,7 @@ string FRAMEWORK = "keras"; // (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file) algo_params = "%d,%d,%d,'%s',%f, '%s', '%s'" % (iters, pop, seed, strategy, mut_prob, ga_params_file, init_params_file); - EQPy_init_package(ME,"deap_ga") => + EQPy_init_package(ME, "deap_ga") => EQPy_get(ME) => EQPy_put(ME, algo_params) => loop(ME, ME_rank) => { @@ -97,22 +105,17 @@ string FRAMEWORK = "keras"; } } -main() { +main { assert(strlen(emews_root) > 0, "Set EMEWS_PROJECT_ROOT!"); - int random_seed = toint(argv("seed", "0")); - int num_iter = toint(argv("ni","100")); // -ni=100 - int num_pop = toint(argv("np","100")); // -np=100; - - //printf("NI: %i # num_iter", num_iter); - //printf("NV: %i # num_variations", num_variations); - //printf("NP: %i # num_pop", num_pop); - //printf("MUTPB: %f # mut_prob", mut_prob); + int random_seed = string2int(argv("seed", "0")); + int num_iter = string2int(argv("ni","100")); + int num_pop = string2int(argv("np","100")); int ME_ranks[]; foreach r_rank, i in r_ranks{ - ME_ranks[i] = toint(r_rank); + ME_ranks[i] = string2int(r_rank); } foreach ME_rank, i in ME_ranks { diff --git a/workflows/GA/test/cfg-prm-1.sh b/workflows/GA/test/cfg-prm-1.sh index 0e5ccbbe..52e1bdb5 100644 --- a/workflows/GA/test/cfg-prm-1.sh +++ b/workflows/GA/test/cfg-prm-1.sh @@ -4,14 +4,19 @@ SEED=${SEED:-1} # Total iterations -NUM_ITERATIONS=${NUM_ITERATIONS:-2} -# Size of GA population (i.e. 
the number of parameter sets to evaluate) -POPULATION_SIZE=${POPULATION_SIZE:-4} -# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See -# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +NUM_ITERATIONS=${NUM_ITERATIONS:-5} +# Size of GA population +# (i.e. the number of parameter sets to evaluate per iteration) +POPULATION_SIZE=${POPULATION_SIZE:-8} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms # for more info. GA_STRATEGY=${STRATEGY:-simple} +# Set IGNORE_ERRORS=1 to ignore model errors and +# allow NaNs in model results: +# export IGNORE_ERRORS=1 + INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} # TODO: move the following code to a utility library- @@ -21,12 +26,14 @@ if [ "$MODEL_NAME" = "combo" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} elif [ "$MODEL_NAME" = "p1b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} -elif [ "$MODEL_NAME" = "adrp" ]; then - PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_param_space_ga.json} elif [ "$MODEL_NAME" = "nt3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} elif [ "$MODEL_NAME" = "tc1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json} # TODO: Uncomment when parameter files are available # elif [ "$MODEL_NAME" = "p1b3" ]; then # PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} @@ -34,7 +41,7 @@ elif [ "$MODEL_NAME" = "tc1" ]; then # 
PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} # elif [ "$MODEL_NAME" = "p2b1" ]; then # PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} -elif [ "$PARAM_SET_FILE" != "" ]; then +elif [ "${PARAM_SET_FILE:-}" != "" ]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else echo "Invalid model-" $MODEL_NAME diff --git a/workflows/GA/test/cfg-prm-paccmann-1.sh b/workflows/GA/test/cfg-prm-paccmann-1.sh new file mode 100644 index 00000000..0bb76e92 --- /dev/null +++ b/workflows/GA/test/cfg-prm-paccmann-1.sh @@ -0,0 +1,48 @@ +# CFG PRM PACCMAN 1 +# GA settings + +SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-3} +# Size of GA population +# (i.e. the number of parameter sets to evaluate per iteration) +POP_DEFAULT=$(( PROCS - 2 )) +POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. 
+GA_STRATEGY=${STRATEGY:-simple} + +# Polaris: +# Run HiDRA on 10 nodes for 3 hours for 20 epochs + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "${PARAM_SET_FILE:-}" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" $MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-prm-polaris.sh b/workflows/GA/test/cfg-prm-polaris.sh new file mode 100644 index 00000000..28862053 --- /dev/null +++ b/workflows/GA/test/cfg-prm-polaris.sh @@ -0,0 +1,42 @@ +# CFG PRM 1 + +# GA settings + +SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-5} +# Size of GA population (i.e. 
the number of parameter sets to evaluate) +POPULATION_SIZE=${POPULATION_SIZE:-9} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. +GA_STRATEGY=${STRATEGY:-simple} + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "$PARAM_SET_FILE" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" $MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-prm-summit.sh b/workflows/GA/test/cfg-prm-summit.sh new file mode 100644 index 00000000..04981a4f --- /dev/null +++ b/workflows/GA/test/cfg-prm-summit.sh @@ -0,0 +1,42 @@ +# CFG PRM 1 + +# GA settings + +SEED=${SEED:-1} +# Total iterations 
+NUM_ITERATIONS=${NUM_ITERATIONS:-1} +# Size of GA population (i.e. the number of parameter sets to evaluate) +POPULATION_SIZE=${POPULATION_SIZE:-274} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. +GA_STRATEGY=${STRATEGY:-simple} + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "$PARAM_SET_FILE" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" $MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-prm-tcnns-1.sh b/workflows/GA/test/cfg-prm-tcnns-1.sh new file mode 100644 index 00000000..caaaf197 --- /dev/null +++ b/workflows/GA/test/cfg-prm-tcnns-1.sh @@ -0,0 +1,45 @@ +# CFG PRM tCNNS 1 + 
+SEED=${SEED:-1} +# Total iterations +NUM_ITERATIONS=${NUM_ITERATIONS:-3} +# Size of GA population +# (i.e. the number of parameter sets to evaluate per iteration) +POP_DEFAULT=$(( PROCS - 2 )) +POPULATION_SIZE=${POPULATION_SIZE:-${POP_DEFAULT}} +# the GA strategy: one of 'simple' or 'mu_plus_lambda'. See +# https://deap.readthedocs.io/en/master/api/algo.html?highlight=eaSimple#module-deap.algorithms +# for more info. +GA_STRATEGY=${STRATEGY:-simple} +export IGNORE_ERRORS=1 + +INIT_PARAMS_FILE=${INIT_PARAMS_FILE:-} + +# TODO: move the following code to a utility library- +# this is a configuration file +# Set the ga parameter space definition file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_param_space_ga.json} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_param_space_ga.json} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_param_space_ga.json} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_param_space_ga.json} +elif [ "$MODEL_NAME" = "tc1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/tc1_param_space_ga.json} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned_param_space_ga.json} +# TODO: Uncomment when parameter files are available +# elif [ "$MODEL_NAME" = "p1b3" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p1b2" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_param_space_ga.json} +# elif [ "$MODEL_NAME" = "p2b1" ]; then +# PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_param_space_ga.json} +elif [ "${PARAM_SET_FILE:-}" != "" ]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + echo "Invalid model-" 
$MODEL_NAME + exit 1 +fi diff --git a/workflows/GA/test/cfg-sys-1.sh b/workflows/GA/test/cfg-sys-1.sh index 27bd5ac5..6192ca61 100644 --- a/workflows/GA/test/cfg-sys-1.sh +++ b/workflows/GA/test/cfg-sys-1.sh @@ -1,19 +1,24 @@ # -# COMBO CFG SYS 1 +# GA CFG SYS 1 # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-4} +export PROCS=${PROCS:-8} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-8} export WALLTIME=${WALLTIME:-01:00:00} #export PROJECT=Candle_ECP +export PROJECT=candle_aesp +# export QUEUE="debug" # Up to 2 nodes +# export QUEUE="debug-scaling" # Up to 10 nodes +export QUEUE="prod" # At least 10 nodes + # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. # If set to -1 there is no timeout. @@ -38,6 +43,9 @@ export IGNORE_ERRORS=0 # it may be ncessary to include its location in the PYTHONPATH # export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + # for running locally, edit as necessary # export PYTHONHOME=$HOME/anaconda3 # export PYTHON=python3.6 diff --git a/workflows/GA/test/cfg-sys-paccmann-1.sh b/workflows/GA/test/cfg-sys-paccmann-1.sh new file mode 100644 index 00000000..ee2c9023 --- /dev/null +++ b/workflows/GA/test/cfg-sys-paccmann-1.sh @@ -0,0 +1,46 @@ +# CFG SYS PACCMANN 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 10 gives you 8 workers, i.e., 8 concurrent Keras runs +export PROCS=${PROCS:-10} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +export WALLTIME=${WALLTIME:-01:00:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified 
number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g., my_script for my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be necessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/cfg-sys-polaris.sh b/workflows/GA/test/cfg-sys-polaris.sh new file mode 100644 index 00000000..9af5cbcf --- /dev/null +++ b/workflows/GA/test/cfg-sys-polaris.sh @@ -0,0 +1,47 @@ +# +# GA CFG SYS POLARIS + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 11 gives you 9 workers, i.e., 9 concurrent Keras runs +export PROCS=${PROCS:-11} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-11} +export QUEUE=${QUEUE:-debug-scaling} +export WALLTIME=${WALLTIME:-00:59:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. 
+# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g., my_script for my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be necessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/cfg-sys-summit.sh b/workflows/GA/test/cfg-sys-summit.sh new file mode 100644 index 00000000..38ee21f8 --- /dev/null +++ b/workflows/GA/test/cfg-sys-summit.sh @@ -0,0 +1,47 @@ +# +# GA CFG SYS SUMMIT + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 276 gives you 274 workers, i.e., 274 concurrent Keras runs +export PROCS=${PROCS:-276} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-6} + +# for summit use these options +export TURBINE_LAUNCH_OPTIONS="-a 1 -g 1 -c 1" + +export WALLTIME=${WALLTIME:-06:00:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. 
+# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g., my_script for my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be necessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/cfg-sys-tcnns-1.sh b/workflows/GA/test/cfg-sys-tcnns-1.sh new file mode 100644 index 00000000..ee2c9023 --- /dev/null +++ b/workflows/GA/test/cfg-sys-tcnns-1.sh @@ -0,0 +1,46 @@ +# CFG SYS TCNNS 1 + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 10 gives you 8 workers, i.e., 8 concurrent Keras runs +export PROCS=${PROCS:-10} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +export WALLTIME=${WALLTIME:-01:00:00} + +#export PROJECT=Candle_ECP + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. 
+# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 + +# if the deap python package is not installed with swift-t's embedded python +# it may be ncessary to include its location in the PYTHONPATH +# export PYTHONPATH=/global/u1/n/ncollier/.local/cori/deeplearning2.7/lib/python2.7/site-packages + +export TURBINE_RESIDENT_WORK_WORKERS=1 +export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + +# for running locally, edit as necessary +# export PYTHONHOME=$HOME/anaconda3 +# export PYTHON=python3.6 +# export SWIFT_T=$HOME/sfw/swift-t-4c8f0afd diff --git a/workflows/GA/test/test-1.sh b/workflows/GA/test/test-1.sh index 002b414c..bc36cec4 100755 --- a/workflows/GA/test/test-1.sh +++ b/workflows/GA/test/test-1.sh @@ -38,6 +38,10 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # The python GA model exploration algorithm export GA_FILE=deap_ga.py +CANDLE_MODEL_TYPE="BENCHMARKS" +# CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=None # Polaris + # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" @@ -48,20 +52,15 @@ then fi # Submit job -$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME - - -# Wait for job -queue_wait - -echo "TO: $TURBINE_OUTPUT" - -cp $0 $TURBINE_OUTPUT +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE # 
Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT OUTPUT=$TURBINE_OUTPUT/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) SCRIPT=$( basename $0 .sh ) -check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" diff --git a/workflows/GA/test/test-bmk.sh b/workflows/GA/test/test-bmk.sh new file mode 100755 index 00000000..59499ec4 --- /dev/null +++ b/workflows/GA/test/test-bmk.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eu + +# GA TEST BMK +# Runs any CANDLE Benchmark using MODEL_IMPL="app" + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +# export PARAM_SET_FILE=graphdrp_param_space_ga.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE=NONE +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-graphdrp-lambda0.sh b/workflows/GA/test/test-graphdrp-lambda0.sh new file mode 100755 index 00000000..599965a8 --- /dev/null +++ b/workflows/GA/test/test-graphdrp-lambda0.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST GRAPHDRP LAMBDA + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=graphdrp_param_space_ga.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-igtd-lambda0.sh b/workflows/GA/test/test-igtd-lambda0.sh new file mode 100755 index 00000000..314f1acb --- /dev/null +++ b/workflows/GA/test/test-igtd-lambda0.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST IGTD LAMBDA + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=../data/igtd_param_space_ga.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/IGTD.sif" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-polaris.sh b/workflows/GA/test/test-polaris.sh new file mode 100755 index 00000000..5a16bba2 --- /dev/null +++ b/workflows/GA/test/test-polaris.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -eu + +# TEST POLARIS +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-polaris.sh +export CFG_PRM=$THIS/cfg-prm-polaris.sh + +# Specify GA file +export GA_FILE=deap_ga.py + +CANDLE_MODEL_TYPE="SINGULARITY" +# CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif # Polaris + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE diff --git a/workflows/GA/test/test-random-lambda7.sh b/workflows/GA/test/test-random-lambda7.sh new file mode 100755 index 00000000..9ca8a27c --- /dev/null +++ b/workflows/GA/test/test-random-lambda7.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eu + +# GA TEST GRAPHDRP LAMBDA + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=random_param_space.json + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE=NONE +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-sif-paccmann-1.sh b/workflows/GA/test/test-sif-paccmann-1.sh new file mode 100755 index 00000000..9d56f8b9 --- /dev/null +++ b/workflows/GA/test/test-sif-paccmann-1.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST SIF Paccman + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-paccmann-1.sh +export CFG_PRM=$THIS/cfg-prm-paccmann-1.sh +export PARAM_SET_FILE=paccmann_param_space.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-sif-tcnns-1.sh b/workflows/GA/test/test-sif-tcnns-1.sh new file mode 100755 index 00000000..507a32cc --- /dev/null +++ b/workflows/GA/test/test-sif-tcnns-1.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST SIF tCNNS + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-tcnns-1.sh +export CFG_PRM=$THIS/cfg-prm-tcnns-1.sh +export PARAM_SET_FILE=tcnns_param_space.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-sif.sh b/workflows/GA/test/test-sif.sh new file mode 100755 index 00000000..ae1de8de --- /dev/null +++ b/workflows/GA/test/test-sif.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eu + +# GA TEST SIF +# Runs any IMPROVE container + +usage() +{ + echo "Usage: test SIF SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh +export PARAM_SET_FILE=graphdrp_param_space_ga.json + +# The Python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE=$MODEL_NAME +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/GA/test/test-summit.sh b/workflows/GA/test/test-summit.sh new file mode 100755 index 00000000..e1fefa9a --- /dev/null +++ b/workflows/GA/test/test-summit.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eu + +# GA TEST 1 + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-summit.sh +export CFG_PRM=$THIS/cfg-prm-summit.sh + +# The python GA model exploration algorithm +export GA_FILE=deap_ga.py + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + + +# Wait for job +queue_wait + +echo "TO: $TURBINE_OUTPUT" + +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" diff --git a/workflows/async-horovod/Problem.py b/workflows/async-horovod/Problem.py index 75760d58..b039f42d 100644 --- a/workflows/async-horovod/Problem.py +++ b/workflows/async-horovod/Problem.py @@ -1,21 +1,31 @@ - # PROBLEM # The bounding box for the optimization problem # This should be a user plug-in from collections import OrderedDict -class Problem(): + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters + # problem specific parameters # space['drop'] = (0, 0.9) # space['batch_size'] = [16, 32, 64, 128, 256, 512] # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] # space['p4'] = ['a', 'b', 'c'] # space["learning_rate"] = (0,0.009) # Make discrete values - space["learning_rate"] = [ 0.001, 0.002, 0.003, 0.004, 0.005, - 0.006, 0.007, 0.008, 0.009 ] + space["learning_rate"] = [ + 0.001, + 0.002, + 0.003, + 0.004, + 0.005, + 0.006, + 0.007, + 0.008, + 0.009, + ] # Use 5 epochs # Add Horovod PARALLELISM [ 64, 128 , 256, 512 ] # ? 1.5h ? ? 
@@ -24,6 +34,7 @@ def __init__(self): self.params = self.space.keys() self.starting_point = [0.1, 16] + # if __name__ == '__main__': # instance = Problem() # print(instance.space) diff --git a/workflows/async-horovod/Task.py b/workflows/async-horovod/Task.py index c7ab03ee..52c10511 100644 --- a/workflows/async-horovod/Task.py +++ b/workflows/async-horovod/Task.py @@ -1,10 +1,11 @@ - # TASK # This should be a user plug-in from __future__ import print_function + import os + class Task: def __init__(self, logger, output, script, parallelism, number, params): @@ -18,7 +19,8 @@ def __init__(self, logger, output, script, parallelism, number, params): self.params = params def go(self): - import json, subprocess + import json + import subprocess J = json.loads(self.params) learning_rate = J["learning_rate"] @@ -26,9 +28,14 @@ def go(self): self.open_output() try: - args = [ self.script, self.output, "%04i"%self.number, - str(self.parallelism), - "adam", str(learning_rate) ] + args = [ + self.script, + self.output, + "%04i" % self.number, + str(self.parallelism), + "adam", + str(learning_rate), + ] self.logger.debug("task: " + " ".join(args)) self.process = subprocess.Popen(args=args, stdin=None, @@ -37,6 +44,7 @@ def go(self): print("started: ", self.process.pid) except Exception as e: import traceback + traceback.print_exc() print("") print("error while attempting to run: " + " ".join(args)) @@ -51,8 +59,9 @@ def open_output(self): except Exception as e: print("") from utils import fail - fail("Could not open task output file: " + - output_file + "\n" + str(e)) + + fail("Could not open task output file: " + output_file + "\n" + + str(e)) def __del__(self): if self.fd is not None: diff --git a/workflows/async-horovod/main.py b/workflows/async-horovod/main.py index 772fab43..eaa8d70f 100644 --- a/workflows/async-horovod/main.py +++ b/workflows/async-horovod/main.py @@ -1,74 +1,90 @@ - # MAIN PY # The main code for the search algorithm from __future__ import print_function 
-import logging, os, sys, time - -from utils import * +import logging +import os +import sys +import time from Problem import Problem from Task import Task +from utils import * logger = logging.getLogger(__name__) + def main(): setup_log(logging.INFO) parallelism, points_init, points_max, cfg, output = parse_args() script, launch_delay = read_cfg(cfg) output = setup_run(output) problem, optimizer = setup_optz() - success = search(problem, optimizer, output, script, launch_delay, - parallelism, points_init, points_max) + success = search( + problem, + optimizer, + output, + script, + launch_delay, + parallelism, + points_init, + points_max, + ) print("Workflow success!" if success else "Workflow failed!") + def setup_log(level): - """ Note that the log level may be changed by the cfg file """ - logging.basicConfig(level=level, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') + """Note that the log level may be changed by the cfg file.""" + logging.basicConfig( + level=level, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) logger.debug("DEBUG") + def parse_args(): import argparse + parser = argparse.ArgumentParser() - parser.add_argument("parallelism", - help="Nodes per Keras run") - parser.add_argument("points_init", - help="Number of initial sample points") - parser.add_argument("points_max", - help="Number of total sample points") - parser.add_argument("cfg_file", - help="The cfg file (see README)") + parser.add_argument("parallelism", help="Nodes per Keras run") + parser.add_argument("points_init", help="Number of initial sample points") + parser.add_argument("points_max", help="Number of total sample points") + parser.add_argument("cfg_file", help="The cfg file (see README)") parser.add_argument("output_directory", help="The output directory (see README)") args = parser.parse_args() print_namespace("optimizer settings:", args) - return (int(args.parallelism), - int(args.points_init), - 
int(args.points_max), - args.cfg_file, - args.output_directory) + return ( + int(args.parallelism), + int(args.points_init), + int(args.points_max), + args.cfg_file, + args.output_directory, + ) + def read_cfg(cfg): import json + try: with open(cfg) as fp: J = json.load(fp) except: fail("Could not open: " + cfg) - defaults = { "launch_delay" : 0, - "log_level" : "INFO" } + defaults = {"launch_delay": 0, "log_level": "INFO"} for d in defaults: if not d in J: J[d] = defaults[d] - check(is_integer(J["launch_delay"]), - "launch_delay must be integer if present: launch_delay='%s'" % - str(J["launch_delay"])) + check( + is_integer(J["launch_delay"]), + "launch_delay must be integer if present: launch_delay='%s'" % + str(J["launch_delay"]), + ) global logger level = string2level(J["log_level"]) @@ -76,6 +92,7 @@ def read_cfg(cfg): return J["script"], J["launch_delay"] + def setup_run(output): if not output[0] == "/": output = os.getcwd() + "/" + output @@ -86,10 +103,10 @@ def setup_run(output): os.makedirs(output) os.chdir(output) except Exception as e: - fail("could not make output directory: " + - output + "\n" + str(e)) + fail("could not make output directory: " + output + "\n" + str(e)) return output + def setup_optz(): logger.debug("setup() START") @@ -101,15 +118,30 @@ def setup_optz(): seed = 42 # Start the optimizer - parDict = { 'kappa' : 1.96 } + parDict = {"kappa": 1.96} space = [problem.space[key] for key in problem.params] - optimizer = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs={}, random_state=seed) + optimizer = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs={}, + random_state=seed, + ) logger.debug("setup() STOP") return (problem, optimizer) -def search(problem, optimizer, output, script, launch_delay, - parallelism, points_init, points_max): + +def search( + problem, + optimizer, + output, + script, + launch_delay, + parallelism, + 
points_init, + points_max, +): print("search start:") # Create the initial sample points @@ -133,8 +165,12 @@ def search(problem, optimizer, output, script, launch_delay, for i, json in enumerate(jsons): # Note: this puts the task in a background process global logger - T = Task(logger, output, script, - parallelism, number=task_count, params=json) + T = Task(logger, + output, + script, + parallelism, + number=task_count, + params=json) status = T.go() if not status: success = False @@ -169,22 +205,23 @@ def search(problem, optimizer, output, script, launch_delay, points = [] return success + def read_val_loss(output, task): - filename = output+"/val_loss-%04i.txt" % task.number + filename = output + "/val_loss-%04i.txt" % task.number try: with open(filename, "r") as fp: result = fp.read() result = result.strip() except Exception as e: - fail("Could not open result file: " + - filename + "\n" + str(e)) + fail("Could not open result file: " + filename + "\n" + str(e)) try: number = float(result) except Exception as e: - fail("Invalid number \"" + result + "\" in result file: " + - filename + "\n" + str(e)) + fail('Invalid number "' + result + '" in result file: ' + filename + + "\n" + str(e)) + + return number - return number -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/workflows/async-horovod/uno-1.json b/workflows/async-horovod/uno-1.json index ba9ef727..18f30904 100644 --- a/workflows/async-horovod/uno-1.json +++ b/workflows/async-horovod/uno-1.json @@ -1,5 +1,5 @@ { - "log_level" : "DEBUG", - "launch_delay": 1, - "script": "/home/wozniak/proj/SV/workflows/async-horovod/run-uno.sh" + "log_level": "DEBUG", + "launch_delay": 1, + "script": "/home/wozniak/proj/SV/workflows/async-horovod/run-uno.sh" } diff --git a/workflows/async-horovod/utils.py b/workflows/async-horovod/utils.py index bfa3f2be..8236dbd3 100644 --- a/workflows/async-horovod/utils.py +++ b/workflows/async-horovod/utils.py @@ -1,7 +1,8 @@ - import json + class 
MyEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -12,36 +13,42 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def is_integer(v): try: - t = v+1 + t = v + 1 except: return False return True + def check(condition, msg): if not condition: fail(msg) + def fail(msg): print(msg) import sys + sys.exit(1) + def string2level(s): import logging - table = { "" : logging.INFO, - "INFO" : logging.INFO, - "DEBUG" : logging.DEBUG } + + table = {"": logging.INFO, "INFO": logging.INFO, "DEBUG": logging.DEBUG} check(s in table, "Invalid log level: " + s) return table[s] + def depth(l): if isinstance(l, list): return 1 + max(depth(item) for item in l) else: return 0 + def create_list_of_json_strings(list_of_lists, params, super_delim=";"): if len(list_of_lists) == 0: @@ -50,12 +57,12 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): # create string of ; separated jsonified maps result = [] - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(params): + for i, p in enumerate(params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -63,9 +70,10 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): return result + def print_namespace(title, ns): print("") print(title) - for k,v in vars(ns).items(): + for k, v in vars(ns).items(): print(" %s %s" % (k, v)) print("") diff --git a/workflows/async-local/Problem.py b/workflows/async-local/Problem.py index f0c0d18e..ed4a8693 100644 --- a/workflows/async-local/Problem.py +++ b/workflows/async-local/Problem.py @@ -1,21 +1,24 @@ - # PROBLEM # The bounding box for the optimization problem # This should be a user plug-in from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - 
space['batch_size'] = [16, 32, 64, 128, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["batch_size"] = [16, 32, 64, 128, 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] + # if __name__ == '__main__': # instance = Problem() # print(instance.space) diff --git a/workflows/async-local/Task.py b/workflows/async-local/Task.py index 7cabd2fe..d9ea6d50 100644 --- a/workflows/async-local/Task.py +++ b/workflows/async-local/Task.py @@ -1,10 +1,11 @@ - # TASK # This should be a user plug-in from __future__ import print_function + import os + class Task: def __init__(self, parallelism, number, params): @@ -16,13 +17,14 @@ def __init__(self, parallelism, number, params): def go(self): import subprocess + # script = "/home/wozniak/proj/SV/workflows/test-horovod/template-theta.sh" script = "/home/wozniak/proj/SV/workflows/async-local/task.sh" try: output = get_output() log = output + ("/%04i.txt" % self.number) self.fd = open(log, "w") - args = [script, str(self.parallelism), self.params], + args = ([script, str(self.parallelism), self.params],) self.process = subprocess.Popen(args=args, stdin=None, stdout=self.fd, @@ -30,6 +32,7 @@ def go(self): print("started: ", self.process.pid) except Exception as e: import traceback + traceback.print_exc() return False return True @@ -39,6 +42,7 @@ def __del__(self): print("closing: " + str(self.number)) self.fd.close() + def get_output(): o = os.getenv("OUTPUT") if o is None: diff --git a/workflows/async-local/main.py b/workflows/async-local/main.py index b75619d3..6a3bb5b6 100644 --- a/workflows/async-local/main.py +++ b/workflows/async-local/main.py @@ -1,42 +1,47 @@ - # MAIN PY # The main code for the search algorithm from __future__ import print_function -import logging, os, sys, time - -from utils import * 
+import logging +import os +import sys +import time from Problem import Problem from Task import Task +from utils import * logger = logging.getLogger(__name__) + def main(): setup_log() parallelism, points_init, points_max = parse_args() problem, optimizer = setup() - success = search(problem, optimizer, - parallelism, points_init, points_max) + success = search(problem, optimizer, parallelism, points_init, points_max) print("Workflow success!" if success else "Workflow failed!") + def setup_log(): - logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) + def parse_args(): import argparse + parser = argparse.ArgumentParser() parser.add_argument("parallelism") parser.add_argument("points_init") parser.add_argument("points_max") args = parser.parse_args() print_namespace("optimizer settings:", args) - return (int(args.parallelism), - int(args.points_init), - int(args.points_max)) + return (int(args.parallelism), int(args.points_init), int(args.points_max)) + def setup(): @@ -49,13 +54,20 @@ def setup(): seed = 42 # Start the optimizer - parDict = { 'kappa' : 1.96 } + parDict = {"kappa": 1.96} space = [problem.space[key] for key in problem.params] - optimizer = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs={}, random_state=seed) + optimizer = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs={}, + random_state=seed, + ) logger.debug("setup() STOP") return (problem, optimizer) + def search(problem, optimizer, parallelism, points_init, points_max): print("search start:") @@ -113,5 +125,6 @@ def search(problem, optimizer, parallelism, points_init, points_max): points = [] return success -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git 
a/workflows/async-local/utils.py b/workflows/async-local/utils.py index 89691668..3efadb76 100644 --- a/workflows/async-local/utils.py +++ b/workflows/async-local/utils.py @@ -1,7 +1,8 @@ - import json + class MyEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -12,12 +13,14 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def depth(l): if isinstance(l, list): return 1 + max(depth(item) for item in l) else: return 0 + def create_list_of_json_strings(list_of_lists, params, super_delim=";"): if len(list_of_lists) == 0: @@ -26,12 +29,12 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): # create string of ; separated jsonified maps result = [] - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(params): + for i, p in enumerate(params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -39,9 +42,10 @@ def create_list_of_json_strings(list_of_lists, params, super_delim=";"): return result + def print_namespace(title, ns): print("") print(title) - for k,v in vars(ns).items(): + for k, v in vars(ns).items(): print(" %s %s" % (k, v)) print("") diff --git a/workflows/async-search/README.md b/workflows/async-search/README.md index 42eec684..e236f461 100644 --- a/workflows/async-search/README.md +++ b/workflows/async-search/README.md @@ -2,33 +2,33 @@ async-search is an asynchronous iterative optimizer written in Python. It evaluates the best values of hyperparameters for CANDLE "Benchmarks" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` -## Running ## +## Running 1. cd into the **Supervisor/workflows/async-search/test** directory -2. Specify the async-search parameters in the *cfg-prm-1.sh* file (INIT_SIZE, etc.). +2. Specify the async-search parameters in the _cfg-prm-1.sh_ file (INIT_SIZE, etc.). 3. Specify the PROCS, queue etc. 
in **cfg-sys-1.sh** file -(NOTE: currently INIT_SIZE must be at least PROCS-2) + (NOTE: currently INIT_SIZE must be at least PROCS-2) 4. You will pass the MODEL_NAME, SITE, and optional experiment id arguments to **test-1.sh** file when launching: -`./test-1.sh <model_name> <machine_name> [expid]` -where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) + `./test-1.sh <model_name> <machine_name> [expid]` + where `model_name` can be tc1 etc., `machine_name` can be local, cori, theta, titan etc. (see [NOTE](#making_changes) below on creating new SITE files.) 5. The parameter space is defined in a **problem\*.py** file (see **workflows/async-search/python/problem_tc1.py** for an example with tc1.). This is imported as `problem` in **async-search.py**. 6. The benchmark will be run for the number of processors specified 7. Final objective function values, along with parameters, will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and switch to the `master` branch. Then `cd` to `workflows/async-search` (the directory containing this README). -* TC1 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- TC1 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. 
-* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data + +Python specific installation needed: - Python specific installation needed: ``` conda install h5py conda install scikit-learn @@ -38,9 +38,10 @@ conda install -c conda-forge keras conda install -c conda-forge scikit-optimize ``` -## Calling sequence ## +## Calling sequence Function calls: + ``` test-1.sh -> swift/workflow.sh -> @@ -56,13 +57,16 @@ test-1.sh -> swift/workflow.sh -> ``` Scheduling scripts: + ``` test-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files ``` + ## Making Changes To create your own SITE files in workflows/common/sh/: + - langs-SITE.sh - langs-app-SITE.sh - modules-SITE.sh @@ -70,27 +74,28 @@ To create your own SITE files in workflows/common/sh/: copy existing ones but modify the langs-SITE.sh file to define the EQPy location (see workflows/common/sh/langs-local.sh for an example). -### Structure ### +### Structure -The point of the script structure is that it is easy to copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. +The point of the script structure is that it is easy to copy and modify the `test-*.sh` script, and the `cfg-*.sh` scripts. These can be checked back into the repo for use by others. The `test-*.sh` script and the `cfg-*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1.sh` and `cfg-{sys,prm}-1.sh` should be unmodified for simple testing. The relevant parameters for the asynchronous search algorithm are defined in `cfg-*.sh` scripts (see example in `cfg-prm-1.sh`). 
These are: + - INIT_SIZE: The number of initial random samples. (Note: INIT_SIZE needs to be larger than PROCS-2 for now.) - MAX_EVALS: The maximum number of evaluations/tasks to perform. - NUM_BUFFER: The size of the tasks buffer that should be maintained above the available workers (num_workers) such that if the currently out tasks are less than (num workers + NUM_BUFFER), more tasks are generated. - MAX_THRESHOLD: Under normal circumstances, when a single model evaluation is finished, a new hyper parameter set is produced for evaluation. If the model evaluations occur within 15 seconds of each other, a MAX_THRESHOLD number of evaluations must occur before the corresponding number of new values are produced for evaluation. This can help with performance when many models finish within a few seconds of each other. - N_JOBS: The number of jobs to run in parallel when producing points (i.e. hyperparameter values) for evaluation. -1 will set this to the number of cores. -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. 
+- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: diff --git a/workflows/async-search/python/as_problem.py b/workflows/async-search/python/as_problem.py index 4a996453..a8dad8c9 100644 --- a/workflows/async-search/python/as_problem.py +++ b/workflows/async-search/python/as_problem.py @@ -1,21 +1,31 @@ from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - space['epochs'] = (2,3) - space['learning_rate'] = (0.00001, 0.1) - space['conv'] = ["50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1"] - space['optimizer'] = ["adam", "sgd", "rmsprop", "adagrad", "adadelta"] - space['batch_size'] = [16, 32, 64, 128, 256, 512] #, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["epochs"] = (2, 3) + space["learning_rate"] = (0.00001, 0.1) + space["conv"] = [ + "50 50 50 50 50 1", + "25 25 25 25 25 1", + "64 32 16 32 64 1", + "100 100 100 100 100 1", + "32 20 16 32 10 1", + ] + space["optimizer"] = ["adam", "sgd", "rmsprop", "adagrad", "adadelta"] + space["batch_size"] = [16, 32, 64, 128, 256, 512] # , 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] -if __name__ == '__main__': + +if __name__ == "__main__": instance = Problem() print(instance.space) - print(instance.params) \ No newline at end of file + print(instance.params) diff --git a/workflows/async-search/python/as_problem_tc1.py b/workflows/async-search/python/as_problem_tc1.py index b3db05f6..c751385f 100644 --- a/workflows/async-search/python/as_problem_tc1.py +++ 
b/workflows/async-search/python/as_problem_tc1.py @@ -1,17 +1,21 @@ from collections import OrderedDict -class Problem(): + + +class Problem: + def __init__(self): space = OrderedDict() - #problem specific parameters - space['drop'] = (0, 0.9) - space['batch_size'] = [16, 32, 64, 128] #, 256, 512] - #space['p3'] = [2 , 4, 8, 16, 32, 64, 128] - #space['p4'] = ['a', 'b', 'c'] + # problem specific parameters + space["drop"] = (0, 0.9) + space["batch_size"] = [16, 32, 64, 128] # , 256, 512] + # space['p3'] = [2 , 4, 8, 16, 32, 64, 128] + # space['p4'] = ['a', 'b', 'c'] self.space = space self.params = self.space.keys() self.starting_point = [0.1, 16] -if __name__ == '__main__': + +if __name__ == "__main__": instance = Problem() print(instance.space) print(instance.params) diff --git a/workflows/async-search/python/async-search.py b/workflows/async-search/python/async-search.py index 4d5b0df4..1bff744f 100644 --- a/workflows/async-search/python/async-search.py +++ b/workflows/async-search/python/async-search.py @@ -1,17 +1,19 @@ -from mpi4py import MPI -import eqpy -import time -import json -import numpy as np -from skopt import Optimizer -import as_problem as problem import datetime +import json import math import sys +import time + +import as_problem as problem +import eqpy +import numpy as np +from mpi4py import MPI +from skopt import Optimizer # list of ga_utils parameter objects problem_params = None + class MyEncoder(json.JSONEncoder): def default(self, obj): @@ -24,8 +26,10 @@ def default(self, obj): else: return super(MyEncoder, self).default(obj) + def create_points(num): - return(";".join([str(i) for i in range(num)])) + return ";".join([str(i) for i in range(num)]) + def depth(l): if isinstance(l, list): @@ -33,16 +37,17 @@ def depth(l): else: return 0 + def create_list_of_json_strings(list_of_lists, super_delim=";"): # create string of ; separated jsonified maps res = [] global problem_params - if (depth(list_of_lists) == 1): + if depth(list_of_lists) == 1: 
list_of_lists = [list_of_lists] for l in list_of_lists: jmap = {} - for i,p in enumerate(problem_params): + for i, p in enumerate(problem_params): jmap[p] = l[i] jstring = json.dumps(jmap, cls=MyEncoder) @@ -50,13 +55,14 @@ def create_list_of_json_strings(list_of_lists, super_delim=";"): return res, (super_delim.join(res)) + def run(): start_time = time.time() print("run() start: {}".format(str(datetime.datetime.now()))) - comm = MPI.COMM_WORLD # get MPI communicator object - size = comm.size # total number of processes - rank = comm.rank # rank of this process - status = MPI.Status() # get MPI status object + comm = MPI.COMM_WORLD # get MPI communicator object + size = comm.size # total number of processes + rank = comm.rank # rank of this process + status = MPI.Status() # get MPI status object print("ME rank is {}".format(rank)) instance = problem.Problem() @@ -70,28 +76,37 @@ def run(): eqpy.OUT_put("Params") # initial parameter set telling us the number of times to run the loop initparams = eqpy.IN_get() - (init_size, max_evals, num_workers, num_buffer, seed, max_threshold, n_jobs) = eval('{}'.format(initparams)) + (init_size, max_evals, num_workers, num_buffer, seed, max_threshold, + n_jobs) = eval("{}".format(initparams)) space = [spaceDict[key] for key in params] print(space) parDict = {} resultsList = [] - parDict['kappa'] = 1.96 + parDict["kappa"] = 1.96 # can set to num cores - parDict['n_jobs'] = n_jobs + parDict["n_jobs"] = n_jobs init_x = [] - opt = Optimizer(space, base_estimator='RF', acq_optimizer='sampling', - acq_func='LCB', acq_func_kwargs=parDict, random_state=seed) + opt = Optimizer( + space, + base_estimator="RF", + acq_optimizer="sampling", + acq_func="LCB", + acq_func_kwargs=parDict, + random_state=seed, + ) eval_counter = 0 askedDict = {} - print("Master starting with {} init_size, {} max_evals, {} num_workers, {} num_buffer, {} max_threshold".format(init_size,max_evals,num_workers,num_buffer, max_threshold)) + print( + "Master starting 
with {} init_size, {} max_evals, {} num_workers, {} num_buffer, {} max_threshold" + .format(init_size, max_evals, num_workers, num_buffer, max_threshold)) x = opt.ask(n_points=init_size) res, resstring = create_list_of_json_strings(x) print("Initial design is {}".format(resstring)) - for r,xx in zip(res,x): + for r, xx in zip(res, x): askedDict[r] = xx eqpy.OUT_put(resstring) currently_out = init_size @@ -101,11 +116,11 @@ def run(): group = comm.Get_group() # Assumes only one adlb_server # num_workers + 1 = num_turbine_workers - newgroup = group.Excl([num_workers+1]) - #print("ME newgroup size is {}".format(newgroup.size)) - newcomm = comm.Create_group(newgroup,1) + newgroup = group.Excl([num_workers + 1]) + # print("ME newgroup size is {}".format(newgroup.size)) + newcomm = comm.Create_group(newgroup, 1) nrank = newcomm.rank - #print("ME nrank is {}".format(nrank)) + # print("ME nrank is {}".format(nrank)) counter_threshold = 1 counter = 0 @@ -115,17 +130,17 @@ def run(): print("\neval_counter = {}".format(eval_counter)) data = newcomm.recv(source=MPI.ANY_SOURCE, status=status) counter = counter + 1 - xstring = data['x'] + xstring = data["x"] x = askedDict[xstring] - y = data['cost'] + y = data["cost"] if math.isnan(y): - y=sys.float_info.max + y = sys.float_info.max opt.tell(x, y) - #source = status.Get_source() - #tag = status.Get_tag() + # source = status.Get_source() + # tag = status.Get_tag() elapsed_time = float(time.time() - start_time) - print('elapsed_time:%1.3f'%elapsed_time) + print("elapsed_time:%1.3f" % elapsed_time) results.append(str(data)) eval_counter = eval_counter + 1 currently_out = currently_out - 1 @@ -142,25 +157,27 @@ def run(): counter_threshold = 1 print("counter_threshold: {}".format(counter_threshold)) - print("currently_out:{}, total_out:{}".format(currently_out,total_out)) - if currently_out < num_workers + num_buffer and total_out < max_evals and counter >= counter_threshold: + print("currently_out:{}, 
total_out:{}".format(currently_out, total_out)) + if (currently_out < num_workers + num_buffer and + total_out < max_evals and counter >= counter_threshold): n_points = counter if n_points + total_out > max_evals: n_points = max_evals - total_out ts = time.time() x = opt.ask(n_points=n_points) res, resstring = create_list_of_json_strings(x) - for r,xx in zip(res,x): + for r, xx in zip(res, x): askedDict[r] = xx eqpy.OUT_put(resstring) - print('point production elapsed_time:%1.3f' % float(time.time() - ts)) + print("point production elapsed_time:%1.3f" % + float(time.time() - ts)) currently_out = currently_out + n_points total_out = total_out + n_points counter = 0 end_iter_time = start_iter_time - print('Search finishing') + print("Search finishing") eqpy.OUT_put("DONE") eqpy.OUT_put(";".join(results)) diff --git a/workflows/async-search/python/utils.py b/workflows/async-search/python/utils.py index e787cf97..8cd5c4f2 100644 --- a/workflows/async-search/python/utils.py +++ b/workflows/async-search/python/utils.py @@ -1,22 +1,22 @@ -from string import Template -import re -import os -import sys -import time +import csv import json import math import os +import re import subprocess -import csv +import sys +import time +from string import Template + def saveResults(resultsList, json_fname, csv_fname): print(resultsList) print(json.dumps(resultsList, indent=4, sort_keys=True)) - with open(json_fname, 'w') as outfile: + with open(json_fname, "w") as outfile: json.dump(resultsList, outfile, indent=4, sort_keys=True) keys = resultsList[0].keys() - with open(csv_fname, 'w') as output_file: + with open(csv_fname, "w") as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(resultsList) diff --git a/workflows/async-search/swift/workflow.sh b/workflows/async-search/swift/workflow.sh index 92187eaa..60c3da05 100755 --- a/workflows/async-search/swift/workflow.sh +++ b/workflows/async-search/swift/workflow.sh @@ -117,10 
+117,7 @@ cp $WORKFLOWS_ROOT/async-search/python/$PY_PACKAGE.py $CFG_SYS $CFG_PRM $TURBINE # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} -echo "OBJ_MODULE: $OBJ_MODULE" -echo "OBJ_DIR: $OBJ_DIR" - +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -131,6 +128,33 @@ then echo "Turbine will wait for job completion." fi +# Use for Summit (LSF needs two %) +if [[ ${SITE:-} == "summit" ]] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + +#swift-t -n $PROCS \ +# -o $TURBINE_OUTPUT/workflow.tic \ +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + STDOUT="" +fi + #export TURBINE_LAUNCH_OPTIONS="-cc none" swift-t -l -n $PROCS \ @@ -155,4 +179,17 @@ swift-t -l -n $PROCS \ -e MPICH_MAX_THREAD_SAFETY=$MPICH_MAX_THREAD_SAFETY \ -e TURBINE_MPI_THREAD=$TURBINE_MPI_THREAD \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT + +# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) 
+echo $TURBINE_OUTPUT > turbine-directory.txt diff --git a/workflows/async-search/swift/workflow_simple_obj_app.sh b/workflows/async-search/swift/workflow_simple_obj_app.sh index 1ad012e5..9e1ec741 100755 --- a/workflows/async-search/swift/workflow_simple_obj_app.sh +++ b/workflows/async-search/swift/workflow_simple_obj_app.sh @@ -110,7 +110,7 @@ cp $CFG_SYS $CFG_PRM $TURBINE_OUTPUT # Allow the user to set an objective function # OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -# OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +# OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/async-search/test/cfg-prm-1.sh b/workflows/async-search/test/cfg-prm-1.sh index 3f6b904a..e3e6b85a 100644 --- a/workflows/async-search/test/cfg-prm-1.sh +++ b/workflows/async-search/test/cfg-prm-1.sh @@ -3,11 +3,11 @@ # async-search settings # Note: INIT_SIZE needs to be larger than PROCS-2 for now. 
-INIT_SIZE=${INIT_SIZE:-500} -MAX_EVALS=${MAX_EVALS:-750} -NUM_BUFFER=${NUM_BUFFER:-250} -MAX_THRESHOLD=${MAX_THRESHOLD:-5} -N_JOBS=${N_JOBS:-48} +INIT_SIZE=${INIT_SIZE:-4} +MAX_EVALS=${MAX_EVALS:-20} +NUM_BUFFER=${NUM_BUFFER:-2} +MAX_THRESHOLD=${MAX_THRESHOLD:-1} +N_JOBS=${N_JOBS:-1} #INIT_SIZE=${INIT_SIZE:-300} #MAX_EVALS=${MAX_EVALS:-550} diff --git a/workflows/async-search/test/cfg-sys-1.sh b/workflows/async-search/test/cfg-sys-1.sh index dc10b22d..ad9a694b 100644 --- a/workflows/async-search/test/cfg-sys-1.sh +++ b/workflows/async-search/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-256} +export PROCS=${PROCS:-4} # MPI processes per node # Cori has 32 cores per node, 128GB per node diff --git a/workflows/async-search/test/test-1.sh b/workflows/async-search/test/test-1.sh index 0c487327..3544bbbd 100755 --- a/workflows/async-search/test/test-1.sh +++ b/workflows/async-search/test/test-1.sh @@ -48,7 +48,7 @@ export OBJ_RETURN="val_loss" # Set OBJ_DIR export OBJ_DIR=$EMEWS_PROJECT_ROOT/obj_folder -export OBJ_MODULE=obj_app +# export OBJ_MODULE= if [[ $SITE == "theta" ]] then @@ -58,21 +58,19 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM -# Check job output -TURBINE_OUTPUT=$( readlink turbine-output ) -OUTPUT=turbine-output/output.txt -WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) - # Wait for job queue_wait +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + SCRIPT=$( basename $0 .sh ) -check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID -check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" - # Local Variables: # c-basic-offset: 2; # End: diff --git a/workflows/async-search/test/test-5K.sh 
b/workflows/async-search/test/test-5K.sh index 222caae9..0f6e0300 100755 --- a/workflows/async-search/test/test-5K.sh +++ b/workflows/async-search/test/test-5K.sh @@ -48,7 +48,7 @@ export OBJ_RETURN="val_loss" # Set OBJ_DIR export OBJ_DIR=$EMEWS_PROJECT_ROOT/obj_folder -export OBJ_MODULE=obj_app +# export OBJ_MODULE= if [[ $SITE == "theta" ]] then @@ -58,17 +58,16 @@ fi # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM -# Check job output -TURBINE_OUTPUT=$( readlink turbine-output ) -OUTPUT=turbine-output/output.txt -WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) - # Wait for job queue_wait +cp $0 $TURBINE_OUTPUT +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + SCRIPT=$( basename $0 .sh ) -check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID -check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" diff --git a/workflows/cmp-cv/py/compare.py b/workflows/cmp-cv/py/compare.py new file mode 100644 index 00000000..a9098aa9 --- /dev/null +++ b/workflows/cmp-cv/py/compare.py @@ -0,0 +1,203 @@ +"""This script can be used to filter a subset of the test set based on the +properties of the drug molecules. + +For example, here we can select the molecules of which the 'prop' is +between two values (provided in the 2nd and 3rd elements of each list in +the conditions list). We can then find the prediction errors for this +domain. Knowledge of the errors of different molecular groups is helpful +to understand the current deficiencies of the drug response models (or +any molecular property prediction model in general). This knowledge +then allows us to improve the models as well as use predictions from the +models which produce highly accurate predictions for certain domains.
+""" + +import os +import pandas as pd +import pandas as pd +import numpy as np +from sklearn.metrics import mean_squared_error + +# conditions = pd.DataFrame( +# [['nAromAtom', 5, 10], ['nAtom', 20, 50], ['BertzCT', 800, 1000]], +# columns=['prop', 'low', 'high']) +# case 2 +conditions = pd.DataFrame( + [ + ['nAtom', 8, 28],['nAtom', 28, 48],['nAtom', 48, 67],['nAtom', 67, 87],['nAtom', 87, 106],['nAtom', 106, 125], + ['nAtom', 125, 145],['nAtom', 145, 164],['nAtom', 164, 184],['nAtom', 184, 203],['nAtom', 203, 222], + ['nAtom', 222, 242],['nAtom', 242, 261],['nAtom', 261, 281],['nAtom', 281, 300],['nAtom', 300, 319], + ['nAtom', 319, 339],['nAtom', 339, 358],['nAtom', 358, 378],['nAtom', 378, 397],['nAtom', 397, 416], + ['nAtom', 416, 436],['nAtom', 436, 455],['nAtom', 455, 494], + ['nAromAtom', 0, 3],['nAromAtom', 3, 6],['nAromAtom', 6, 10],['nAromAtom', 10, 13], + ['nAromAtom', 13, 16],['nAromAtom', 16, 19],['nAromAtom', 19, 22],['nAromAtom', 22, 26], + ['nAromAtom', 26, 29],['nAromAtom', 29, 32],['nAromAtom', 32, 35],['nAromAtom', 35, 38], + ['nAromAtom', 38, 42],['nAromAtom', 42, 45],['nAromAtom', 45, 48], + ['nRing', 0, 2],['nRing', 2, 3],['nRing', 3, 5],['nRing', 5, 6], + ['nRing', 6, 8],['nRing', 8, 10],['nRing', 10, 11],['nRing', 11, 13], + ['nRing', 38, 42],['nRing', 42, 45],['nRing', 45, 48], + ['nAcid', 0, 1],['nAcid', 1, 2],['nAcid', 2, 3],['nAcid', 3, 4], + ['BertzCT', 7.50964047e+00, 9.80918522e+02], ['BertzCT', 9.80918522e+02, 1.95422740e+03], + ['BertzCT', 1.95422740e+03, 2.92753628e+03],['BertzCT', 2.92753628e+03, 3.90084517e+03], + ['BertzCT', 3.90084517e+03, 4.87415405e+03],['BertzCT', 4.87415405e+03, 5.84746293e+03], + ['BertzCT', 5.84746293e+03, 6.82077181e+03],['BertzCT', 6.82077181e+03, 7.79408069e+03], + ['BertzCT', 7.79408069e+03, 8.76738957e+03],['BertzCT', 8.76738957e+03, 9.74069845e+03], + ['nRot', 0, 10],['nRot', 10, 19],['nRot', 19, 29],['nRot', 29, 38],['nRot', 38, 48], + ['nRot', 48, 58],['nRot', 58, 67],['nRot', 67, 77],['nRot', 
77, 86],['nRot', 86, 96] + ], + columns=['prop', 'low', 'high']) + +# from cmp_utils import conditions, Benchmark + +CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + + +def compare(model_name, exp_id, run_id): + cmp_results = {} + print(f"compare: run_id={run_id}") + # gParams = read_params(exp_id, run_id) + # model = gParams("model_name") + + # model = "DrugCell" # TODO: Hardcoded. have to get this from output dir? + # turbine_output = os.getenv("TURBINE_OUTPUT") + + CANDLE_DATA_DIR = os.getenv("CANDLE_DATA_DIR") + outdir = os.path.join(CANDLE_DATA_DIR, model_name, "Output", exp_id, run_id) + directory = outdir + # directory = f"{CANDLE_DATA_DIR}/Output/{exp_id}/{run_id}" + print("reading the predictions....") + df_res = pd.read_csv(f"{directory}/test_predictions.csv") + + # a class to calculate errors for subsets of the validation/test set + print("reading the drug feature file....") + # TODO: Should have to save the above file in this file + # copy and place the following in your CANDLE_DATA_DIR + # cp /lambda_stor/homes/ac.gpanapitiya/ccmg-mtg/benchmark/drug_features.csv . 
+ # bmk = Benchmark(fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + # ) # TODO: have to have a drug features for a common test set + # subset_err, final_domain_err = bmk.error_by_feature_domains_model( + # df_res, conditions) + + # # or this + fp_path=f'{CANDLE_DATA_DIR}/drug_features.csv' + subset_err, final_domain_err = error_by_feature_domains_model(fp_path, df_res, conditions) + + # collect results for comparison + # cmp_prop = 'nAtom' # TODO: Get this from gParameters + # subset_err.set_index( + # 'prop', inplace=True + # ) # TODO: use 'prop' as a parameter and move it to cmp_models.txt + # cmp_results[run_id] = subset_err.loc[ + # cmp_prop, + # 'error'] # this is the property based on which we want to do the comparison + cmp_results[run_id] = -1 # for case 2, this is not defined + # # cmp_results[run_id] = -1 # set to -1 for now as we don't have the drug features file + # with open(f"{directory}/subset_err.txt", "w") as fp: + # fp.write(str(cmp_results[run_id])) + + subset_err.to_csv(f"{directory}/domain_err.csv", index=False) + + return str(cmp_results[run_id]) + + +def error_by_feature_domains_model(fp_path, preds, conditions): + + fps = pd.read_csv(fp_path) + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] # this is in case we want to progressively + # consider domains. 
A domain composed of multiple domains + + final_domain_err = keep.err.mean() # return this + report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + +class Benchmark: + + def __init__(self, fp_path): + + self.fps = pd.read_csv(fp_path) + # self.model_preds = model_preds + # self.feature_conditions = feature_conditions + self.reports = {} + + def error_by_feature_domains_model(self, preds, conditions): + + fps = self.fps + report = [] + preds['err'] = abs(preds['true'] - preds['pred']) + keep = preds.copy() + for i in range(conditions.shape[0]): + + prop = conditions.loc[i, 'prop'] + low = conditions.loc[i, 'low'] + high = conditions.loc[i, 'high'] + + locs = np.logical_and(fps[prop] <= high, fps[prop] > low) + smiles = fps.loc[locs, 'smiles'].values + tmp = preds[preds.smiles.isin(smiles)] + mean_err = tmp.err.mean() + + report.append([prop, low, high, mean_err]) + + keep = keep[keep.smiles.isin(smiles)] + + final_domain_err = keep.err.mean() # return this + report = pd.DataFrame(report, columns=['prop', 'low', 'high', 'error']) + return report, final_domain_err + + def error_by_feature_domains(self, feature_conditions): + + results = [] + for model_name, pred in self.model_preds.items(): + + report = self.error_by_feature_domains_model( + pred, feature_conditions) + report.loc[:, 'model'] = model_name + results.append(report) + + results = pd.concat(results, axis=0) + results = results.loc[:, ['model', 'prop', 'low', 'high', 'error']] + results.reset_index(drop=True, inplace=True) + + return results + + def rank_by_acc(self, metric='rmse', th=3): + + results = {} + for model_name, pred in self.model_preds.items(): + sub = pred[pred.labels > th] + rmse = mean_squared_error(y_true=sub['labels'], + y_pred=sub['preds'])**.5 + + results[model_name] = {'rmse': rmse} + + results = pd.DataFrame.from_dict(results) + results = results.T + return results + + +def create_grid_files(): + + dc_grid = {'epochs': [1, 2], 'lr': 
[1e-2, 1e-3]} + sw_grid = {'epochs': [3, 4], 'lr': [1e-2, 1e-5]} + + with open('DrugCell_grid.json', 'w') as fp: + json.dump(dc_grid, fp) + + with open('SWnet_CCLE_grid.json', 'w') as fp: + json.dump(sw_grid, fp) diff --git a/workflows/cmp-cv/swift/workflow.sh b/workflows/cmp-cv/swift/workflow.sh new file mode 100755 index 00000000..9a2b32d9 --- /dev/null +++ b/workflows/cmp-cv/swift/workflow.sh @@ -0,0 +1,116 @@ +#! /usr/bin/env bash +set -eu + +# CMP-CV WORKFLOW SH + +# Autodetect this workflow directory +export CANDLE_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $CANDLE_PROJECT_ROOT/.. ) + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "CMP-CV: usage: workflow.sh SITE EXPID CFG_SYS PLAN" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # UPF is the hyperparameter list file; MODELS is the models file + get_site $1 && \ + get_expid $2 && \ + get_cfg_sys $3 && \ + UPF=$4 + MODELS=$5 + } +then + usage + exit 1 +fi + +source_site env $SITE +source_site sched $SITE + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +export PYTHONPATH="${PYTHONPATH}:$WORKFLOWS_ROOT/cmp-cv/py" +log_path PYTHONPATH + +export TURBINE_JOBNAME="CMP_${EXPID}" + +export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} +export BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-21600} # 6 hours +PLAN="PLAN_NOT_DEFINED" +CMD_LINE_ARGS=( -expid=$EXPID + -benchmark_timeout=$BENCHMARK_TIMEOUT + -plan=$PLAN + -models=$MODELS + -gparams=$UPF + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to TURBINE_OUTPUT directory +log_script + +# Copy settings to TURBINE_OUTPUT for provenance +cp $CFG_SYS $TURBINE_OUTPUT + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run + +cp -v $UPF $TURBINE_OUTPUT +
+TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +# TURBINE_STDOUT= + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "cmp-cv workflow.sh: Set CANDLE_DATA_DIR!" +fi + +export CANDLE_IMAGE=${CANDLE_IMAGE:-} + +export CANDLE_MODEL_IMPL=container + +which swift-t + +swift-t -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ + ${MACHINE:-} \ + -p \ + -I $WORKFLOWS_ROOT/common/swift \ + -i model_$CANDLE_MODEL_IMPL \ + -e BENCHMARKS_ROOT \ + -e CANDLE_PROJECT_ROOT \ + -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ + -e MODEL_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ + $( python_envs ) \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + +# Can provide this to debug Python settings: +# -e PYTHONVERBOSE=1 +# Can provide this if needed for debugging crashes: +# -e PYTHONUNBUFFERED=1 +# Can provide this if needed to reset PATH: +# -e PATH=$PATH diff --git a/workflows/cmp-cv/swift/workflow.swift b/workflows/cmp-cv/swift/workflow.swift new file mode 100644 index 00000000..6eec2dd1 --- /dev/null +++ b/workflows/cmp-cv/swift/workflow.swift @@ -0,0 +1,94 @@ + +/** + CMP-CV WORKFLOW.SWIFT +*/ + +import assert; +import io; +import json; +import files; +import string; +import sys; + +import candle_utils; +// report_env(); + +string FRAMEWORK = "pytorch"; + +// Scan command line +// file plan = input(argv("plan")); +file model_file = input(argv("models")); +file gparams_file = input(argv("gparams")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); + +string expid = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); + +// Report some key facts: +printf("CMP-CV: %s", filename(model_file)); +system1("date \"WORKFLOW START: +%Y-%m-%d %H:%M\""); + +// Read 
unrolled parameter file +// string plan_lines[] = file_lines(plan); +string model_lines[] = file_lines(model_file); + +string gparams_lines[] = file_lines(gparams_file); + +// Resultant output values: +string results[]; +// string run_ids[]; + +// compare(string exp_id, string run_id) +// { +// python_persist("import compare", +// "compare.compare(\"%s\", \"%s\")") % (exp_id, run_id); +// } + +compare(string model_name, string expid, string runid) +{ + printf("Calling compare with model_name: %s", model_name)=> + python_persist("import compare", "compare.compare(\"%s\", \"%s\", \"%s\")" % (model_name, expid, runid) ); + // python_persist("import compare", "compare.compare()"); +} + +// Evaluate each parameter set +// foreach model, i in model_lines +// { +foreach gparam, j in gparams_lines +{ + // runid = i*1000000 + j; + runid = j; + + printf("runid: %s", runid); + // printf("model: %s", model); + + // printf("model: %s", model); + // m = "\"model_name\": \"%s\"" % model; + + // gparams = replace(gparam, "MORE_PARAMS", m, 0); + printf("gparams: %s", gparam); + // printf("GPARAMS: %s", gparams); + model_name = json_get(gparam, "model_name"); + candle_image = json_get(gparam, "candle_image"); + printf("MODEL: %s", model_name); + // printf(gparams); + // results[runid] = obj(gparam, expid, repr(runid) ); + model_script = "train.sh"; + results[runid] = // obj_container(gparam, expid, repr(runid), + // model_name, candle_image, model_script) => + candle_model_train(gparam, expid, repr(runid), + candle_image) => + + compare(model_name, expid, repr(runid)); + // results[runid] = obj(gparam, expid, repr(runid)); + // => compare(expid, repr(runid) ); + + // assert(results[i] != "EXCEPTION", "exception in obj()!"); +} +// } + +// // Join all result values into one big semicolon-delimited string +// string result = join(run_ids, ";"); +// // and print it +// printf("WORKFLOW RESULT: " + result); diff --git a/workflows/cmp-cv/test/cfg-sys-1.sh 
b/workflows/cmp-cv/test/cfg-sys-1.sh new file mode 100644 index 00000000..5158b14a --- /dev/null +++ b/workflows/cmp-cv/test/cfg-sys-1.sh @@ -0,0 +1,27 @@ + +# CMP-CV CFG SYS 1 + +# Use 1 for interactive workflows +# export INTERACTIVE=1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-2} + +#export QUEUE=${QUEUE:-batch} + +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=woz@anl.gov + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/cmp-cv/test/make-upf-1.sh b/workflows/cmp-cv/test/make-upf-1.sh new file mode 100755 index 00000000..eb5c0049 --- /dev/null +++ b/workflows/cmp-cv/test/make-upf-1.sh @@ -0,0 +1,26 @@ +#!/bin/zsh + + + +OUTPUT=$1 + +# Use ZSH for range operation + +EPOCHS_MIN=10 +EPOCHS_MAX=20 +BATCH_SIZE_MIN=5 +BATCH_SIZE_MAX=7 + + +for EPOCHS in {$EPOCHS_MIN..$EPOCHS_MAX} +do + for BATCH_SIZE in {$BATCH_SIZE_MIN..$BATCH_SIZE_MAX} + do + BS2=$(( 2 ** BATCH_SIZE )) + echo "{" + echo "\"epochs\": $EPOCHS," + echo "\"batch_size\": $BATCH_SIZE," + echo "MORE_PARAMS" + echo "}" + done +done > $OUTPUT diff --git a/workflows/cmp-cv/test/models-1.txt b/workflows/cmp-cv/test/models-1.txt new file mode 100644 index 00000000..650f4e0b --- /dev/null +++ b/workflows/cmp-cv/test/models-1.txt @@ -0,0 +1,3 @@ +DrugCell +# SWnet +# tCNN diff --git a/workflows/cmp-cv/test/plan-small-1.txt b/workflows/cmp-cv/test/plan-small-1.txt new file mode 100644 index 00000000..618fa71c --- /dev/null +++ b/workflows/cmp-cv/test/plan-small-1.txt @@ -0,0 +1,2 @@ +# PLAN SMALL 1 TXT +{ "hyperparam1": "value1", ...
} diff --git a/workflows/cmp-cv/test/test-small-1.sh b/workflows/cmp-cv/test/test-small-1.sh new file mode 100755 index 00000000..0929c4ee --- /dev/null +++ b/workflows/cmp-cv/test/test-small-1.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +# export MODEL_NAME=$1 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export MODEL_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt diff --git a/workflows/cmp-cv/test/upf-1.txt b/workflows/cmp-cv/test/upf-1.txt new file mode 100644 index 00000000..24e024bb --- /dev/null +++ b/workflows/cmp-cv/test/upf-1.txt @@ -0,0 +1,27 @@ + +{"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} +{"id": "RUN002", "epochs": 2, "model_name": "DrugCell", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/DrugCell.sif"} +{"id": "RUN003", "epochs": 1, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +{"id": "RUN004", "epochs": 2, "model_name": "SWnet", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} + +#GraphDRP +#{"id": "RUN004", "training_data" : "path/to/dir, +# "testing_data": "path/to/dir", +# "infer_data" : [ path/to/dir ], +# "model_params": name_of_model_params_output_of_training, +# "epochs": 50, +# "model_name": "GraphDRP", +# "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": 
"GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "GraphDRP", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#{"id": "RUN004", "epochs": 50, "model_name": "DeepTTC", "candle_image": "/lambda_stor/homes/ac.knutson/ccmg-mtg/Singularity/build/SWnet.sif"} +#DeepTTC + +# # NOTE: Everything after "candle_image" is stripped! +# # Insert new parameters before "candle_image" +# {"id": "RUN000", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +# {"id": "RUN001", "epochs": 1, "model_name": "DrugCell", "candle_image": "/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif"} +# {"id": "RUN002", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +# {"id": "RUN003", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +# {"id": "RUN004", "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} diff --git a/workflows/common/R/README.adoc b/workflows/common/R/README.adoc new file mode 100644 index 00000000..809bc44f --- /dev/null +++ b/workflows/common/R/README.adoc @@ -0,0 +1,20 @@ + +Run install-candle.sh to set up R for CANDLE HPO via mlrMBO. + +Unstructured historical notes follow. + +# Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! 
+#install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") # This dies with a dependency error but plotly is installed anyway as a dependency of the following packages, so I'm putting it back into the PKGS list (ALW, 9/29/20) +#install.packages("https://cran.r-project.org/src/contrib/Archive/jsonlite/jsonlite_1.7.0.tar.gz") # ALW adding this on 9/12/20 (and removing jsonlite from PKGS list below) because sometime in the first two weeks of Sept 2020 the default jsonlite version became 1.7.1 and this seems to throw an error that looks to me like a bug that should be fixed with time; e.g., while everything worked in early Sept 2020 (probably 9/2/20), now on 9/12/20 I get this error: +# * DONE (jsonlite) +# 1): succeeded '/usr/local/apps/R/4.0/4.0.0/lib64/R/bin/R CMD INSTALL -l '/gpfs/gsfs9/users/BIDS-HPC/public/software/distributions/candle/dev_2/builds/R/libs' '/lscratch/64803361/Rtmpnd5yDC/downloaded_packages/jsonlite_1.7.1.tar.gz'' +# The downloaded source packages are in +# /lscratch/64803361/Rtmpnd5yDC/downloaded_packages +# [1] "" +# LOAD: jsonlite +# Error in value[[3L]](cond) : +# Package 'jsonlite' version 1.7.0 cannot be unloaded: +# Error in unloadNamespace(package) : namespace jsonlite is imported by plotly so cannot be unloaded +# Calls: library ... 
tryCatch -> tryCatchList -> tryCatchOne -> +# Execution halted +# ****NOTE**** that I tried installing both plotly and jsonlite the normal way (in the PKGS below instead of a specific version above) and I got the same error diff --git a/workflows/common/R/install-candle.R b/workflows/common/R/install-candle.R index 51ade33b..6d895197 100644 --- a/workflows/common/R/install-candle.R +++ b/workflows/common/R/install-candle.R @@ -3,27 +3,28 @@ # Run this via install-candle.sh # Installs all R packages needed for Supervisor workflows - # mlrMBO may need APT packages libxml2-dev libssl-dev curl-dev -NCPUS = 16 - +# Installation settings: r <- getOption("repos") +# Change this mirror as needed: # r["CRAN"] <- "http://cran.cnr.berkeley.edu/" r["CRAN"] <- "http://cran.wustl.edu/" options(repos = r) +NCPUS = 8 -# Force Plotly 4.5.6 - not latest! Do not want shiny/httpuv, it does not work on Cooley! -install.packages("https://cran.r-project.org/src/contrib/Archive/plotly/plotly_4.5.6.tar.gz") - -PKGS=( - "smoof", +# Do plotly early in the list: It requires OpenSSL and Curl headers +# which may not be available. 
+PKGS <- list( + "RInside", + "plotly", + "jsonlite", "rgenoud", "DiceKriging", - "randomForest" - "jsonlite", + # not available for R 3.6.1 : needed for mlrMBO HPO: + "randomForest", "parallelMap", - "RInside", + # requires smoof requires misc3d requires --with-tcltk : "mlrMBO" ) diff --git a/workflows/common/R/install-candle.sh b/workflows/common/R/install-candle.sh index 27f70414..afaa537e 100755 --- a/workflows/common/R/install-candle.sh +++ b/workflows/common/R/install-candle.sh @@ -1,12 +1,14 @@ -#!/bin/sh +#!/bin/bash set -eu # INSTALL CANDLE R # Installs all R packages needed for Supervisor workflows -# pass CONFIRM=0 via command line for by passing options, default is CONFIRM=1 +# pass CONFIRM=0 via command line for bypassing options, +# default is CONFIRM=1 : ${CONFIRM:=1} + while getopts ":y" OPTION do case $OPTION in @@ -21,9 +23,30 @@ done echo "This will install multiple R packages for CANDLE." echo -echo "using R: $( which R )" -echo "using gcc: $( which gcc )" -echo "using gfortran: $( which gfortran )" + +if ! command which R > /dev/null +then + echo "No R found!" 
+ exit 1 +fi + +echo "variables:" +set +u # These variables may be unset +for var in CC CXX FC +do + printf "using %-8s = %s\n" $var ${!var} +done +echo +set -u + +echo "tools:" +for tool in R cc CC gcc g++ ftn gfortran +do + if command which $tool > /dev/null 2>&1 + then + printf "using %-10s %s\n" "${tool}:" $( which $tool ) + fi +done echo if [ $CONFIRM = 1 ] @@ -34,4 +57,4 @@ then fi THIS=$( dirname $0 ) -nice R -f $THIS/install-candle.R +nice R -f $THIS/install-candle.R |& tee install-candle.log diff --git a/workflows/common/R/mlrMBO-default.R b/workflows/common/R/mlrMBO-default.R index 2774704d..5bd52e9f 100644 --- a/workflows/common/R/mlrMBO-default.R +++ b/workflows/common/R/mlrMBO-default.R @@ -70,26 +70,26 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE) #mtry = 6, #se.method = "bootstrap", se.boot = 50, se.ntree = 100) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$double.xmax, trafo.y.fun = makeMBOTrafoFunction('log', log)) - ctrl = setMBOControlInfill(ctrl, + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) diff --git a/workflows/common/R/mlrMBO-ls1.R b/workflows/common/R/mlrMBO-ls1.R index 63548d75..4e8360b4 100644 --- a/workflows/common/R/mlrMBO-ls1.R +++ b/workflows/common/R/mlrMBO-ls1.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -168,29 +168,29 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + print(sprintf("nevals = %03d", nrow(all_res))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- 
sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -200,7 +200,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -233,7 +233,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -245,12 +245,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git a/workflows/common/R/mlrMBO-ls2.R b/workflows/common/R/mlrMBO-ls2.R index dd7be142..65c0ffc0 100644 --- a/workflows/common/R/mlrMBO-ls2.R +++ b/workflows/common/R/mlrMBO-ls2.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ 
-86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -167,28 +167,28 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., 
data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -198,7 +198,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) for (index in c(1:k)){ @@ -228,7 +228,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- FALSE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -240,12 +240,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git a/workflows/common/R/mlrMBO-ls3.R b/workflows/common/R/mlrMBO-ls3.R index c12a972b..cdd38db0 100644 --- a/workflows/common/R/mlrMBO-ls3.R +++ b/workflows/common/R/mlrMBO-ls3.R @@ -68,16 +68,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not 
positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -86,7 +86,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -98,15 +98,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -117,7 +117,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -167,28 +167,28 @@ main_function <- function(max.budget = 110, #iterative phase starts while (itr < max_itr){ - + min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(y ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -198,7 +198,7 @@ main_function <- function(max.budget = 110, rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) @@ -235,7 +235,7 @@ main_function <- function(max.budget = 110, temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -247,12 +247,12 @@ main_function <- function(max.budget = 110, res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) 
itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) } - + return(all_res) } diff --git a/workflows/common/R/mlrMBO-mbo.R b/workflows/common/R/mlrMBO-mbo.R index 98a9702b..4a5869d7 100644 --- a/workflows/common/R/mlrMBO-mbo.R +++ b/workflows/common/R/mlrMBO-mbo.R @@ -1,6 +1,14 @@ + # mlrMBO EMEWS Algorithm Wrapper + set.seed(12345) + options(warn=2) + options(error=function()traceback(2)) - # mlrMBO EMEWS Algorithm Wrapper +options( + parallelMap.default.mode = "local", + parallelMap.default.cpus = 1, + parallelMap.default.show.info = TRUE +) emews_root <- Sys.getenv("EMEWS_PROJECT_ROOT") if (emews_root == "") { @@ -22,30 +30,52 @@ level = NA_character_, show.info = NA){ st = proc.time() - + print("parallelMap2() ...") + mode <- deparse(substitute(fun)) + print(paste0("mode: ", mode)) #For wrapFun do this: initdesign - if (deparse(substitute(fun)) == "wrapFun"){ + if (mode == "wrapFun"){ + print("wrapFun") dots <- list(...) string_params <- elements_of_lists_to_json(dots[[1L]]) # print(dots) # print(paste0("parallelMap2 called with list_param: ",string_params)) - # print(paste("parallelMap2 called with list size:", length(string_params))) + print(paste("mlrMBO: produced task count: ", length(dots[[1L]]))) OUT_put(string_params) string_results = IN_get() - st = proc.time() - st # Assumes results are in the form a;b;c # Note: can also handle vector returns for each, # i.e., a,b;c,d;e,f + print(paste0("mlrMBO: received result: ", string_results)) res <- string_to_list_of_vectors(string_results) + print(paste("mlrMBO: received result count:", length(res))) # using dummy time - return(result_with_extras_if_exist(res,st[3])) + extras = result_with_extras_if_exist(res,st[3]) + print(paste0("mlrMBO: extras: ", extras)) + return(extras) } - # For all other values of deparse(substitute(fun)) eg. proposePointsByInfillOptimization, doBaggingTrainIteration etc. 
- else{ - return(pm(fun, ..., more.args = more.args, simplify = simplify, use.names = use.names, impute.error = impute.error, - level = level, show.info = show.info)) + # For all other values of deparse(substitute(fun)) eg. + # proposePointsByInfillOptimization, doBaggingTrainIteration etc. + else { + print("pm() ...") + # tryCatch( + ## pm_out <- pm(fun, ..., more.args = more.args, simplify = simplify, + ## use.names = use.names, impute.error = impute.error, + ## level = level, show.info = show.info) + # , + # error=function(e){print(paste("CATCH: ", e))}) + dots <- list(...) + print(paste0("dots: ", dots)) + flush.console() + pm_out <- fun(opt.state=...) +## , more.args = more.args, simplify = simplify, +## use.names = use.names, impute.error = impute.error, +## level = level, show.info = show.info) + print(paste0("pm_out ...", pm_out)) + flush.console() + return(pm_out) } } @@ -64,33 +94,35 @@ # dummy objective function simple.obj.fun = function(x){} - main_function <- function(max.budget = 110, + main_function <- function(max.budget = 1000, max.iterations = 10, design.size=10, propose.points=10, restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 2) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, - impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax, - trafo.y.fun = makeMBOTrafoFunction('log', log)) - ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, - opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + impute.y.fun = function(x, y, opt.path, ...) 
.Machine$double.xmax) + ctrl = setMBOControlInfill(ctrl, + crit = crit.cb +# makeMBOInfillCritCB(), + # opt.restarts = 1 +# , +# opt.focussearch.points = 1000 +) + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -143,9 +175,12 @@ } # each discrete variable should be represented once, else optimization will fail # this checks if design size is less than max number of discrete values - print(paste0("design size=", design.size, " must be greater or equal to maximum discrete values=", max_val_discrete)) if (design.size < max_val_discrete){ - print("Aborting! design.size is less than the discrete parameters specified") + print(paste0("design size=", design.size, + " must be >= to maximum discrete values=", + max_val_discrete)) + print("Aborting! design.size < the discrete parameters specified") + flush.console() quit() } @@ -167,8 +202,10 @@ design = chkpntResults } # print(paste("design:", design)) - configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") - res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, show.info = TRUE) + configureMlr() + # show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") + res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, + show.info = TRUE) return(res) } @@ -179,7 +216,7 @@ # This is a string of R code containing arguments to main_function(), # e.g., "max.budget = 110, max.iterations = 10, design.size = 10, ..." 
msg <- IN_get() - print(paste("Received params1 msg: ", msg)) + cat(paste0("Received mlrMBO configuration parameters msg: ", msg)) # Edit the R code to make a list constructor expression code = paste0("list(",msg,")") @@ -206,8 +243,10 @@ turbine_output <- Sys.getenv("TURBINE_OUTPUT") if (turbine_output != "") { + print(paste0("setwd(): ", turbine_output)) setwd(turbine_output) } + print("saving final_res.Rds ...") # This will be saved to experiment directory saveRDS(final_res,file = "final_res.Rds") diff --git a/workflows/common/R/mlrMBO-rs.R b/workflows/common/R/mlrMBO-rs.R index 6d352719..0a8ef054 100644 --- a/workflows/common/R/mlrMBO-rs.R +++ b/workflows/common/R/mlrMBO-rs.R @@ -69,25 +69,25 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", - predict.type = "se", + surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE) #mtry = 6, #se.method = "bootstrap", se.boot = 50, se.ntree = 100) - ctrl = makeMBOControl(n.objectives = 1, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$double.xmax ) - ctrl = setMBOControlInfill(ctrl, + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) - ctrl = setMBOControlTermination(ctrl, - max.evals = max.budget, + ctrl = setMBOControlTermination(ctrl, + max.evals = max.budget, iters = max.iterations) chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -119,7 +119,7 @@ if (is.null(chkpntResults)){ par.set = getParamSet(obj.fun) - + ## represent each discrete value once # get the maximum number of variables max_val_discrete = 0 diff --git a/workflows/common/R/mlrMBO1.R b/workflows/common/R/mlrMBO1.R index 6f02de3c..f651d19e 100644 --- a/workflows/common/R/mlrMBO1.R +++ b/workflows/common/R/mlrMBO1.R @@ -70,7 +70,7 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, mtry = 6, se.method = "bootstrap", se.boot = 50, se.ntree = 100) @@ -83,7 +83,7 @@ chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart1:", restart.file)) diff --git a/workflows/common/R/mlrMBO2.R b/workflows/common/R/mlrMBO2.R index 7a322909..62f071d6 100644 --- a/workflows/common/R/mlrMBO2.R +++ b/workflows/common/R/mlrMBO2.R @@ -67,16 +67,16 @@ main_function <- function(max.budget = 110, max.iterations = 10, design.size=10, propose.points=10, - restart.file="DISABLED", + restart.file="DISABLED", learner1.name = "randomForest") { if (learner1.name == "km"){ print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = 
"matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -85,7 +85,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -97,15 +97,15 @@ main_function <- function(max.budget = 110, } else if (learner1.name == "randomForest"){ print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, - se.method = "bootstrap", + se.method = "bootstrap", se.boot = 2, se.ntree = 10, ntree=1000, mtry=8) - ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, + ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) 
.Machine$integer.max * 0.1 ) ctrl = setMBOControlTermination(ctrl, max.evals = propose.points) - ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), + ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritEI(se.threshold = 0.0), opt.restarts = 1, opt.focussearch.points = 1000) } else{ @@ -116,7 +116,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -156,10 +156,10 @@ main_function <- function(max.budget = 110, configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) #return(res) - + init_res<-as.data.frame(res$opt.path) min.index<-which(init_res$y==min(init_res$y))[1] - + par.set = getParamSet(obj.fun) pars = par.set$pars lens = getParamLengths(par.set) diff --git a/workflows/common/R/mlrMBO2a.R b/workflows/common/R/mlrMBO2a.R index 274693cf..b182518b 100644 --- a/workflows/common/R/mlrMBO2a.R +++ b/workflows/common/R/mlrMBO2a.R @@ -71,7 +71,7 @@ restart.file) { print("Using randomForest") - surr.rf = makeLearner("regr.randomForest", predict.type = "se", + surr.rf = makeLearner("regr.randomForest", predict.type = "se", fix.factors.prediction = TRUE, mtry = 6, se.method = "bootstrap", se.boot = 50, se.ntree = 100) @@ -84,7 +84,7 @@ chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) @@ -135,30 +135,30 @@ #iterative phase starts while (itr < max_itr){ - + print(sprintf("nevals = %03d", nrow(all_res))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) 
- + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + train.model <- randomForest(log(y) ~ ., data=reqDF, ntree=100000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) var.imp[which(var.imp[,1] < 0),1]<-0 index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- abs(var.imp[index,1]) norm.scores <- 100 * scores / sum(scores) @@ -168,7 +168,7 @@ rnames <- inputs[remove.index] print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -201,7 +201,7 @@ temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) yvals <- predict(train.model,design) - + USE_MODEL <- FALSE #TRUE if(USE_MODEL){ design <- cbind(y=yvals, design) @@ -213,7 +213,7 @@ res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = TRUE) itr_res<-as.data.frame(res$opt.path) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 all_res <- rbind(all_res, itr_res) diff --git a/workflows/common/R/mlrMBO_km.R b/workflows/common/R/mlrMBO_km.R index 186a1417..ca706f0f 100644 --- a/workflows/common/R/mlrMBO_km.R +++ b/workflows/common/R/mlrMBO_km.R @@ -69,11 +69,11 @@ main_function <- function(max.budget = 110, propose.points=10, restart.file) { print("Using Kriging.") - surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) + surr.rf = makeLearner("regr.km", predict.type = "se") #covtype = "matern3_2", control = list(trace = FALSE)) - #TODO: Avoid error: + #TODO: Avoid error: # [mbo] 3: latent_dim=2; batch_size=35; learning_rate=0.0762; epochs=8 : y = 0.203 : 29.6 secs : infill_cb - # Error in chol.default(R) : + # Error in chol.default(R) : # the leading minor of order 29 is not positive definite # The issue is mentioned here: 
https://github.com/mlr-org/mlrMBO/issues/80 # y = MyTrainingData$MyTarget @@ -82,7 +82,7 @@ main_function <- function(max.budget = 110, ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, impute.y.fun = function(x, y, opt.path, ...) .Machine$integer.max * 0.1) - + # y = MyTrainingData$MyTarget # Nuggets = 1e-8*var(y) # setHyperPars(learner = surr.rf, nugget=Nuggets) @@ -93,7 +93,7 @@ main_function <- function(max.budget = 110, chkpntResults<-NULL # TODO: Make this an argument - restartFile<-restart.file + restartFile<-restart.file if (file.exists(restart.file)) { print(paste("Loading restart:", restart.file)) diff --git a/workflows/common/R/test/ils-test.R b/workflows/common/R/test/ils-test.R index a94b1667..71321d83 100644 --- a/workflows/common/R/test/ils-test.R +++ b/workflows/common/R/test/ils-test.R @@ -10,48 +10,48 @@ library(randomForest) fun = function(x) { x = as.list(x) - res = 0 + res = 0 print(x) print(paste(x,sep=",",collapse=";")) r = as.numeric(x$batch_size) i = as.numeric(x$drop) res<-r+i - + if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } par.set = makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - - + + # use consecutive 978-neuron layers to facilitate residual connections # makeDiscreteParam("dense", values=c("1500 500", # "978 978", @@ -59,21 +59,21 @@ par.set = makeParamSet( # "978 978 978 978", # "978 978 978 978 978", # "978 978 978 978 978 978")), - + makeDiscreteParam("residual", values=c(1, 0)), - + 
makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("drop", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=2, upper=3) ) @@ -89,13 +89,13 @@ max.budget <- 1500 propose.points<-9 max.iterations<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget, iters = max.iterations) -ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, +ctrl = setMBOControlInfill(ctrl, + crit = makeMBOInfillCritCB(), + opt.restarts = 1, opt.focussearch.points = 1000) # d1 = generateGridDesign(par.set, trafo = TRUE) @@ -151,10 +151,10 @@ ptm <- proc.time() # dummy objective function simple.obj.fun = function(x){} -surr.rf = makeLearner("regr.randomForest", - predict.type = "se", +surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 8) @@ -178,20 +178,20 @@ surr.rf = makeLearner("regr.randomForest", time <-(proc.time() - ptm) print(sprintf("nevals = %03d; itr = %03d; time = %5.5f;", nrow(all_res), itr, as.numeric(time[3]))) min.index<-which(itr_res$y==min(itr_res$y)) - + par.set.t = par.set0 pars = par.set.t$pars lens = getParamLengths(par.set.t) k = sum(lens) pids = getParamIds(par.set.t, repeated = TRUE, with.nr = TRUE) - + snames = c("y", pids) reqDF = subset(itr_res, select = snames, drop =TRUE) bestDF <- reqDF[min.index,] print("reqDF") print(nrow(reqDF)) print(summary(reqDF)) - + print("itr-rf") train.model 
<- randomForest(log(y) ~ ., data=reqDF, ntree=10000, keep.forest=TRUE, importance=TRUE) var.imp <- importance(train.model, type = 1) @@ -199,7 +199,7 @@ surr.rf = makeLearner("regr.randomForest", index <- sort(abs(var.imp[,1]), decreasing = TRUE, index.return = TRUE)$ix - + inputs <- rownames(var.imp)[index] scores <- var.imp[index,1] remove.index <- which(scores >= 0.9*max(scores)) @@ -208,7 +208,7 @@ surr.rf = makeLearner("regr.randomForest", print('removing:') print(rnames) - + par.set1<-par.set0 pnames<-names(par.set$pars) print(par.set1) @@ -239,7 +239,7 @@ surr.rf = makeLearner("regr.randomForest", } else { par.set1$pars[[index]]<-makeNumericParam(p, lower=ll, upper=uu, trafo = trafo) } - } + } } } } @@ -258,8 +258,8 @@ surr.rf = makeLearner("regr.randomForest", temp<-rbind(design,reqDF[,-1]) design <- head(temp, n = propose.points) - - + + USE_MODEL <- TRUE if(USE_MODEL){ yvals <- predict(train.model,design) @@ -270,13 +270,13 @@ surr.rf = makeLearner("regr.randomForest", } print("mbo-itr") print(yvals) - + print(summary(yvals)) res = mbo(obj.fun, design = design, learner = surr.rf, control = ctrl, show.info = FALSE) itr_res<-as.data.frame(res$opt.path) itr_res<-cbind(itr_res, stime = as.numeric(time[3])) itr_res<-tail(itr_res, n = propose.points) - + par.set0<-par.set1 itr <- itr + 1 print("bug msg:") diff --git a/workflows/common/R/test/learner-discrete-param-bug.R b/workflows/common/R/test/learner-discrete-param-bug.R index 4af45b36..1b62abc2 100644 --- a/workflows/common/R/test/learner-discrete-param-bug.R +++ b/workflows/common/R/test/learner-discrete-param-bug.R @@ -16,27 +16,27 @@ fun = function(x) { r = as.numeric(x$batch_size) i = as.numeric(x$drop) res<-r+i - + if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } @@ -49,7 +49,7 @@ par.set = makeParamSet( # 
use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), # large batch_size only makes sense when warmup_lr is on - # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), + # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", @@ -80,13 +80,13 @@ max.budget <- 1500 propose.points<-9 max.iterations<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget, iters = max.iterations) -ctrl = setMBOControlInfill(ctrl, - crit = makeMBOInfillCritCB(), - opt.restarts = 1, +ctrl = setMBOControlInfill(ctrl, + crit = makeMBOInfillCritCB(), + opt.restarts = 1, opt.focussearch.points = 1000) # d1 = generateGridDesign(par.set, trafo = TRUE) @@ -138,10 +138,10 @@ for (v in par.set$pars){ design=mydesign -surr.rf = makeLearner("regr.randomForest", - predict.type = "se", +surr.rf = makeLearner("regr.randomForest", + predict.type = "se", fix.factors.prediction = TRUE, - se.method = "jackknife", + se.method = "jackknife", se.boot = 8) diff --git a/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R b/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R index eec3496b..68aaa098 100644 --- a/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R +++ b/workflows/common/R/test/mlrMBOMixedIntegerTest11a.R @@ -20,23 +20,23 @@ fun = function(x) { if(x$model=="ae"){ res<-res*1000 } - + if(x$activation == "relu"){ res<-res*1000 } - + if(x$optimizer == "sgd"){ res<-res*1000 } if(x$optimizer == "sgd"){ res<-res*1000 - } - + } + 
if(as.numeric(x$reduce_lr)){ res<-res*1000 } - + return(res) } @@ -49,7 +49,7 @@ par.set = makeParamSet( # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), # large batch_size only makes sense when warmup_lr is on - # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), + # makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", @@ -79,13 +79,13 @@ obj.fun = makeSingleObjectiveFunction( max.budget <- 1500 propose.points<-5 -ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, +ctrl = makeMBOControl(n.objectives = 1, propose.points = propose.points, trafo.y.fun = makeMBOTrafoFunction('log', log), impute.y.fun = function(x, y, opt.path, ...) .Machine$double.xmax ) ctrl = setMBOControlTermination(ctrl, max.evals = max.budget) -ctrl = setMBOControlInfill(ctrl, +ctrl = setMBOControlInfill(ctrl, crit = makeMBOInfillCritCB(), - opt.restarts = 1, + opt.restarts = 1, opt.focussearch.points = 1000) @@ -94,4 +94,3 @@ design = head(design, n = propose.points) configureMlr(show.info = FALSE, show.learner.output = FALSE, on.learner.warning = "quiet") res = mbo(obj.fun, design = design, learner = NULL, control = ctrl, show.info = TRUE) - diff --git a/workflows/common/R/test/test_utils.R b/workflows/common/R/test/test_utils.R index 7fac7440..0ebb9dfa 100644 --- a/workflows/common/R/test/test_utils.R +++ b/workflows/common/R/test/test_utils.R @@ -1,4 +1,4 @@ -# Split the string pushed into OUT_put into +# Split the string pushed into OUT_put into # list of numerical vectors (used in simple_mlrMBO_run_test.R) split.into.param.lines <- function(x){ res1 <- unlist(strsplit(x,split = ";")) @@ -17,4 +17,4 @@ split.json.into.dummy.param.lines <- function(x){ make.into.q.res <- 
function(x){ paste0(x,collapse = ";") -} \ No newline at end of file +} diff --git a/workflows/common/db/candle_sql.py b/workflows/common/db/candle_sql.py index 1f653d51..c92faaf2 100644 --- a/workflows/common/db/candle_sql.py +++ b/workflows/common/db/candle_sql.py @@ -1,34 +1,31 @@ - import datetime import logging import os import sqlite3 import sys + def setup_db(db_file): - """ - Convenience function to use from Swift/T - """ - if 'DB' not in globals(): - rank = os.getenv('PMIX_RANK') - print('rank %s Connecting to DB...' % rank) + """Convenience function to use from Swift/T.""" + if "DB" not in globals(): + rank = os.getenv("PMIX_RANK") + print("rank %s Connecting to DB..." % rank) global DB DB = candle_sql(db_file) return DB + class candle_sql: def __init__(self, db_file, log=False): - """ - Sets up a wrapper around the SQL connection and cursor objects - Also caches dicts that convert between names and ids for the - features and studies tables - """ - #self.conn = sqlite3.connect(db_file) - #self.cursor = self.conn.cursor() + """Sets up a wrapper around the SQL connection and cursor objects Also + caches dicts that convert between names and ids for the features and + studies tables.""" + # self.conn = sqlite3.connect(db_file) + # self.cursor = self.conn.cursor() self.db_file = db_file - self.autoclose = True - self.logger = None # Default + self.autoclose = True + self.logger = None # Default if log: logging.basicConfig(format="SQL: %(message)s") self.logger = logging.getLogger("candle_sql") @@ -40,10 +37,11 @@ def connect(self): self.cursor.execute("PRAGMA busy_timeout = 30000") def insert(self, table, names, values): - """ Do a SQL insert """ - names_tpl = sql_tuple(names) + """Do a SQL insert.""" + names_tpl = sql_tuple(names) values_tpl = sql_tuple(values) - cmd = "insert into {} {} values {};".format(table, names_tpl, values_tpl) + cmd = "insert into {} {} values {};".format(table, names_tpl, + values_tpl) self.execute(cmd) rowid = 
str(self.cursor.lastrowid) return rowid @@ -75,19 +73,22 @@ def __del__(self): def q(s): - """ Quote the given string """ + """Quote the given string.""" return "'" + str(s) + "'" + def qL(L): - """ Quote each list entry as a string """ + """Quote each list entry as a string.""" return map(q, L) + def qA(*args): - """ Quote each argument as a string """ + """Quote each argument as a string.""" return map(q, args) + def sql_tuple(L): - """ Make the given list into a SQL-formatted tuple """ + """Make the given list into a SQL-formatted tuple.""" result = "" result += "(" result += ",".join(L) diff --git a/workflows/common/ext/EQ-Py/eqpy.py b/workflows/common/ext/EQ-Py/eqpy.py index ace77806..96ac970f 100644 --- a/workflows/common/ext/EQ-Py/eqpy.py +++ b/workflows/common/ext/EQ-Py/eqpy.py @@ -1,6 +1,7 @@ -import threading +import importlib import sys -import importlib, traceback +import threading +import traceback EQPY_ABORT = "EQPY_ABORT" @@ -17,6 +18,7 @@ aborted = False wait_info = None + class WaitInfo: def __init__(self): @@ -27,6 +29,7 @@ def getWait(self): self.wait += 1 return self.wait + class ThreadRunner(threading.Thread): def __init__(self, runnable): @@ -41,6 +44,7 @@ def run(self): # tuple of type, value and traceback self.exc = traceback.format_exc() + def init(pkg): global p, wait_info wait_info = WaitInfo() @@ -48,6 +52,7 @@ def init(pkg): p = ThreadRunner(imported_pkg) p.start() + def output_q_get(): global output_q, aborted wait = wait_info.getWait() @@ -71,10 +76,12 @@ def output_q_get(): return result + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): - #global input_q + # global input_q result = input_q.get() return result diff --git a/workflows/common/ext/EQ-R/eqr/BlockingQueue.h b/workflows/common/ext/EQ-R/eqr/BlockingQueue.h index c9dfd41c..a9f983da 100644 --- a/workflows/common/ext/EQ-R/eqr/BlockingQueue.h +++ b/workflows/common/ext/EQ-R/eqr/BlockingQueue.h @@ -24,7 +24,7 @@ class BlockingQueue { } 
this->d_condition.notify_one(); } - + T pop() { std::unique_lock lock(this->d_mutex); // [ capture-list ] ( params ) { body } diff --git a/workflows/common/python/deap_ga.py b/workflows/common/python/deap_ga.py index 046ff2f6..24280619 100644 --- a/workflows/common/python/deap_ga.py +++ b/workflows/common/python/deap_ga.py @@ -1,69 +1,141 @@ -import threading -import random -import time -import math +""" +DEAP GA PY + +EMEWS interface module for DEAP +""" + import csv import json +import math +import random +import sys +import threading import time +import log_tools +import eqpy +import ga_utils import numpy as np +from deap import algorithms, base, creator, tools -from deap import base -from deap import creator -from deap import tools -from deap import algorithms +# List of ga_utils parameter objects: +ga_params = None -import eqpy, ga_utils +# Last mean value (used if there are no new values): +mean_last = None + +generation = 1 +logger = log_tools.get_logger(None, "DEAP") -# list of ga_utils parameter objects -ga_params = None def obj_func(x): + """ + Just a stub for the DEAP framework + """ return 0 -# {"batch_size":512,"epochs":51,"activation":"softsign", -#"dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, -#"learning_rate":0.0301,"conv":"25 25 25 25 25 1"} -def create_list_of_json_strings(list_of_lists, super_delim=";"): - # create string of ; separated jsonified maps - res = [] + +def create_list_of_json_strings(list_of_lists, super_delimiter=";"): + """ + create string of semicolon-separated jsonified maps + Produces something like: + {"batch_size":512,"epochs":51,"activation":"softsign", + "dense":"2000 1000 1000 500 100 50","optimizer":"adagrad","drop":0.1378, + "learning_rate":0.0301,"conv":"25 25 25 25 25 1"} + """ + result = [] global ga_params - for l in list_of_lists: - jmap = {} - for i,p in enumerate(ga_params): - jmap[p.name] = l[i] + for L in list_of_lists: + json_string = create_json_string(L) + result.append(json_string) 
+ return super_delimiter.join(result) + - jstring = json.dumps(jmap) - res.append(jstring) +def create_json_string(L, indent=None): + json_dict = {} + for i, p in enumerate(ga_params): + json_dict[p.name] = L[i] + result = json.dumps(json_dict, indent=indent) + return result - return (super_delim.join(res)) def create_fitnesses(params_string): - """return equivalent length tuple list + """ + return equivalent length tuple list. :type params_string: str """ params = params_string.split(";") # get length res = [(i,) for i in range(len(params))] - return (res) + return res + + +def make_floats(results): + """ + results: String of data from workflow + return: List of singleton-tuples, each a float + This function converts the workflow strings to the DEAP format, + and replaces any string NaNs in the results with + the mean of the current generation or + the mean of the prior generation. + """ + global mean_last + tokens = results.split(";") + NaNs = [] + values = [] + output = {} + floats = [] + for i, token in enumerate(tokens): + if token.lower() == "nan": + output[i] = "nan" + NaNs.append(i) + else: + f = float(token) + output[i] = f + values.append(f) + logger.info("RESULTS: values: %i NaNs: %i" % + (len(values), len(NaNs))) + if len(values) > 0: + mean = sum(values) / len(values) + mean_last = mean + else: + assert mean_last is not None, \ + "all generation=1 results are NaN!" + mean = mean_last -def queue_map(obj_func, pops): - # Note that the obj_func is not used - # sending data that looks like: - # [[a,b,c,d],[e,f,g,h],...] + for i in NaNs: + output[i] = mean + for i in range(0, len(tokens)): + floats.append((output[i],)) + return floats + + +def queue_map(_f, pops): + """ + Note that _f is not used, but is part of the DEAP framework + Formats model parameters that look like: + [[a,b,c,d],[e,f,g,h],...] 
+ """ if not pops: return [] + global generation + generation_start = time.time() + logger.info("GENERATION: %i START: pop: %i" % + (generation, len(pops))) + sys.stdout.flush() eqpy.OUT_put(create_list_of_json_strings(pops)) - result = eqpy.IN_get() - split_result = result.split(';') - # TODO determine if max'ing or min'ing and use -9999999 or 99999999 - return [(float(x),) if not math.isnan(float(x)) else (float(99999999),) for x in split_result] - #return [(float(x),) for x in split_result] + results = eqpy.IN_get() + duration = time.time() - generation_start + logger.info("GENERATION: %i STOP. duration: %0.3f" % + (generation, duration)) + sys.stdout.flush() + generation += 1 + floats = make_floats(results) + return floats + def make_random_params(): - """ - Performs initial random draw on each parameter - """ + """Performs initial random draw on each parameter.""" global ga_params draws = [] @@ -72,26 +144,32 @@ def make_random_params(): return draws + def parse_init_params(params_file): init_params = [] with open(params_file) as f_in: reader = csv.reader(f_in) header = next(reader) for row in reader: - init_params.append(dict(zip(header,row))) + init_params.append(dict(zip(header, row))) return init_params + def update_init_pop(pop, params_file): - global ga_params - print("Reading initial population from {}".format(params_file)) + global ga_params, logger + logger.info("Reading initial population from {}".format(params_file)) + sys.stdout.flush() init_params = parse_init_params(params_file) if len(pop) > len(init_params): - raise ValueError("Not enough initial params to set the population: size of init params < population size") + raise ValueError( + "Not enough initial params to set the population: size of init params < population size" + ) for i, indiv in enumerate(pop): for j, param in enumerate(ga_params): indiv[j] = param.parse(init_params[i][param.name]) + # keep as reference for log type # def mutGaussian_log(x, mu, sigma, mi, mx, indpb): # if 
random.random() < indpb: @@ -101,11 +179,10 @@ def update_init_pop(pop, params_file): # x = math.pow(10, logx) # return x + # Returns a tuple of one individual def custom_mutate(individual, indpb): - """ - Mutates the values in list individual with probability indpb - """ + """Mutates the values in list individual with probability indpb.""" # Note, if we had some aggregate constraint on the individual # (e.g. individual[1] * individual[2] < 10), we could copy @@ -116,29 +193,41 @@ def custom_mutate(individual, indpb): for i, param in enumerate(ga_params): individual[i] = param.mutate(individual[i], mu=0, indpb=indpb) - return individual, + return (individual,) + def cxUniform(ind1, ind2, indpb): c1, c2 = tools.cxUniform(ind1, ind2, indpb) return (c1, c2) + def timestamp(scores): return str(time.time()) + def run(): """ :param num_iter: number of generations :param num_pop: size of population :param seed: random seed :param strategy: one of 'simple', 'mu_plus_lambda' - :param ga parameters file name: ga parameters file name (e.g., "ga_params.json") + :param ga parameters file name: ga parameters file name + (e.g., "ga_params.json") :param param_file: name of file containing initial parameters """ + global logger + start_time = time.time() + logger.info("OPTIMIZATION START") + sys.stdout.flush() + eqpy.OUT_put("Params") params = eqpy.IN_get() - # parse params - (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file, param_file) = eval('{}'.format(params)) + # Evaluate and log the params given by the workflow level: + (num_iter, num_pop, seed, strategy, mut_prob, ga_params_file, + param_file) = eval("{}".format(params)) + log_params(logger, num_iter, num_pop, seed) + random.seed(seed) global ga_params ga_params = ga_utils.create_parameters(ga_params_file) @@ -171,27 +260,71 @@ def run(): # num_iter-1 generations since the initial population is evaluated once first mutpb = mut_prob - start_time = time.time() - if strategy == 'simple': - pop, log = 
algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=mutpb, ngen=num_iter - 1, - stats=stats, halloffame=hof, verbose=True) - elif strategy == 'mu_plus_lambda': + + if strategy == "simple": + pop, log = algorithms.eaSimple( + pop, + toolbox, + cxpb=0.5, + mutpb=mutpb, + ngen=num_iter - 1, + stats=stats, + halloffame=hof, + verbose=True, + ) + elif strategy == "mu_plus_lambda": mu = int(math.floor(float(num_pop) * 0.5)) lam = int(math.floor(float(num_pop) * 0.5)) if mu + lam < num_pop: mu += num_pop - (mu + lam) - pop, log = algorithms.eaMuPlusLambda(pop, toolbox, mu=mu, lambda_=lam, - cxpb=0.5, mutpb=mutpb, ngen=num_iter - 1, - stats=stats, halloffame=hof, verbose=True) + pop, log = algorithms.eaMuPlusLambda( + pop, + toolbox, + mu=mu, + lambda_=lam, + cxpb=0.5, + mutpb=mutpb, + ngen=num_iter - 1, + stats=stats, + halloffame=hof, + verbose=True, + ) else: - raise NameError('invalid strategy: {}'.format(strategy)) + raise NameError("invalid strategy: {}".format(strategy)) end_time = time.time() fitnesses = [str(p.fitness.values[0]) for p in pop] + logger.info("OPTIMIZATION STOP") + sys.stdout.flush() + + best_i = -1 + best_fitness = sys.float_info.max + for i in range(0, len(fitnesses)): + f = float(fitnesses[i]) + if f < best_fitness: + best_i = i + best_fitness = f + logger.info("BEST: %s == ...\n%s" % + (best_fitness, create_json_string(pop[i], indent=2))) + sys.stdout.flush() + eqpy.OUT_put("DONE") # return the final population - eqpy.OUT_put("{}\n{}\n{}\n{}\n{}".format(create_list_of_json_strings(pop), ';'.join(fitnesses), - start_time, log, end_time)) + eqpy.OUT_put("{}\n{}\n{}\n{}\n{}\n".format( + create_list_of_json_strings(pop), + ";".join(fitnesses), + start_time, + log, + end_time, + )) + + +def log_params(logger, num_iter, num_pop, seed): + logger.info("HPO PARAMS START") + logger.info("num_iter: %4i" % num_iter) + logger.info("num_pop: %4i" % num_pop) + logger.info("seed: %4i" % seed) + logger.info("HPO PARAMS STOP") diff --git 
a/workflows/common/python/dummy_baseline_keras2.py b/workflows/common/python/dummy_baseline_keras2.py index c0edfa5b..64b10a22 100644 --- a/workflows/common/python/dummy_baseline_keras2.py +++ b/workflows/common/python/dummy_baseline_keras2.py @@ -1,18 +1,22 @@ - # DUMMY BASELINE KERAS2 # To support workflow debugging + def initialize_parameters(): - return {} # empty dictionary + return {} # empty dictionary + class fake_history: + def __init__(self, x): - self.history = {"val_loss":[x]} + self.history = {"val_loss": [x]} + def run(params): print("RUNNING DUMMY: " + str(params)) import random - #value = float(len(str(params))) + random.random() + + # value = float(len(str(params))) + random.random() value = random.random() result = fake_history(value) return result diff --git a/workflows/common/python/ga_utils.py b/workflows/common/python/ga_utils.py index 2454dfde..f5bc4a3d 100644 --- a/workflows/common/python/ga_utils.py +++ b/workflows/common/python/ga_utils.py @@ -1,6 +1,10 @@ from __future__ import print_function -import random, json, sys, math +import json +import math +import random +import sys + def is_number(s): try: @@ -9,6 +13,7 @@ def is_number(s): except ValueError: return False + class ConstantParameter(object): def __init__(self, name, value): @@ -28,6 +33,7 @@ def parse(self, s): return int(s) return s + class NumericParameter(object): def __init__(self, name, lower, upper, sigma): @@ -56,6 +62,7 @@ def mutate(self, x, mu, indpb): def parse(self, s): return int(s) + class FloatParameter(NumericParameter): def __init__(self, name, lower, upper, sigma): @@ -71,9 +78,11 @@ def mutate(self, x, mu, indpb): def parse(self, s): return float(s) -#import logging -#logging.basicConfig() -#log = logging.getLogger("a") + +# import logging +# logging.basicConfig() +# log = logging.getLogger("a") + def str_to_bool(s): if s.lower() == "true": @@ -81,30 +90,35 @@ def str_to_bool(s): else: return False + class ListParameter(object): def __init__(self, name, 
categories, element_type): self.name = name self.categories = categories - if element_type == 'float': + if element_type == "float": self.parse_func = float - elif element_type == 'int': + elif element_type == "int": self.parse_func = int - elif element_type == 'string': + elif element_type == "string": self.parse_func = str - elif element_type == 'logical': + elif element_type == "logical": self.parse_func = str_to_bool else: - raise ValueError("Invalid type: {} - must be one of 'float', 'int', 'string', or 'logical'") + raise ValueError( + "Invalid type: {} - must be one of 'float', 'int', 'string', or 'logical'" + ) def parse(self, s): return self.parse_func(s) + class CategoricalParameter(ListParameter): def __init__(self, name, categories, element_type): - super(CategoricalParameter, self).__init__(name, categories, element_type) + super(CategoricalParameter, self).__init__(name, categories, + element_type) def randomDraw(self): i = random.randint(0, len(self.categories) - 1) @@ -119,6 +133,7 @@ def mutate(self, x, mu, indpb): x = a return x + class OrderedParameter(ListParameter): def __init__(self, name, categories, sigma, element_type): @@ -145,6 +160,7 @@ def mutate(self, x, mu, indpb): x = self.categories[n] return x + class LogicalParameter: def __init__(self, name): @@ -164,48 +180,51 @@ def parse(self, s): else: return False + def create_parameters(param_file, ignore_sigma=False): with open(param_file) as json_file: data = json.load(json_file) params = [] for item in data: - name = item['name'] - t = item['type'] + name = item["name"] + t = item["type"] if ignore_sigma: - sigma = float('nan') - if t == 'int' or t == 'float': - lower = item['lower'] - upper = item['upper'] + sigma = float("nan") + if t == "int" or t == "float": + lower = item["lower"] + upper = item["upper"] if not ignore_sigma: - sigma = item['sigma'] + sigma = item["sigma"] - if t == 'int': - params.append(IntParameter(name, int(lower), int(upper), - int(sigma))) + if t == "int": + 
params.append( + IntParameter(name, int(lower), int(upper), int(sigma))) else: - params.append(FloatParameter(name, float(lower), float(upper), - float(sigma))) + params.append( + FloatParameter(name, float(lower), float(upper), + float(sigma))) - elif t == 'categorical': - vs = item['values'] - element_type = item['element_type'] + elif t == "categorical": + vs = item["values"] + element_type = item["element_type"] params.append(CategoricalParameter(name, vs, element_type)) - elif t == 'logical': + elif t == "logical": params.append(LogicalParameter(name)) elif t == "ordered": - vs = item['values'] + vs = item["values"] if not ignore_sigma: - sigma = item['sigma'] - element_type = item['element_type'] + sigma = item["sigma"] + element_type = item["element_type"] params.append(OrderedParameter(name, vs, sigma, element_type)) - elif t == 'constant': - vs = item['value'] + elif t == "constant": + vs = item["value"] params.append(ConstantParameter(name, vs)) return params -if __name__ == '__main__': + +if __name__ == "__main__": create_parameters(sys.argv[1]) diff --git a/workflows/common/python/log_runner.py b/workflows/common/python/log_runner.py index dea00252..f23709ef 100644 --- a/workflows/common/python/log_runner.py +++ b/workflows/common/python/log_runner.py @@ -1,27 +1,32 @@ import sys + import exp_logger + def log_start(): parameter_map = {} - parameter_map['pp'] = sys.argv[2] - parameter_map['iterations'] = sys.argv[3] - parameter_map['params'] = "\"\"\"{}\"\"\"".format(sys.argv[4]) - parameter_map['algorithm'] = sys.argv[5] - parameter_map['experiment_id'] = sys.argv[6] - sys_env = "\"\"\"{}\"\"\"".format(sys.argv[7]) + parameter_map["pp"] = sys.argv[2] + parameter_map["iterations"] = sys.argv[3] + parameter_map["params"] = '"""{}"""'.format(sys.argv[4]) + parameter_map["algorithm"] = sys.argv[5] + parameter_map["experiment_id"] = sys.argv[6] + sys_env = '"""{}"""'.format(sys.argv[7]) exp_logger.start(parameter_map, sys_env) + def log_end(): exp_id = 
sys.argv[2] exp_logger.end(exp_id) + def main(): print(sys.argv) - if sys.argv[1] == 'start': + if sys.argv[1] == "start": log_start() else: log_end() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/workflows/common/python/log_tools.py b/workflows/common/python/log_tools.py index 557fb5c0..aa28e8eb 100644 --- a/workflows/common/python/log_tools.py +++ b/workflows/common/python/log_tools.py @@ -1,20 +1,52 @@ - # LOG TOOLS # Standardize some Python logging techniques import sys -def get_logger(logger, name, stream=sys.stdout): - """ Set up logging """ +logger = None + + +def get_logger(logger, name, stream=sys.stdout, milliseconds=False): + """ + Set up logging if necessary + If the caller's logger already exists, just return it + """ if logger is not None: return logger import logging + logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) h = logging.StreamHandler(stream=stream) - fmtr = logging.Formatter('%(asctime)s %(name)s %(levelname)-6s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') + if not milliseconds: + fmtr = logging.Formatter("%(asctime)s %(name)s %(levelname)-5s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S") + else: + fmtr = logging.Formatter("%(asctime)s.%(msecs)03d %(name)s %(levelname)-5s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S") + h.setFormatter(fmtr) logger.addHandler(h) return logger + + +# def log(msg): +# global logger +# logger.info(msg) + +# def log_info(msg): +# global logger +# logger = get_logger(logger) +# logger.info(msg) + +# def debug(msg): +# global logger +# logger = get_logger(logger) +# logger.debug(msg) + + +def timestamp(): + from datetime import datetime + + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") diff --git a/workflows/common/python/model_abstention_runner.py b/workflows/common/python/model_abstention_runner.py new file mode 100644 index 00000000..129f53fe --- /dev/null +++ b/workflows/common/python/model_abstention_runner.py @@ -0,0 +1,331 @@ +# MODEL RUNNER PY + +# See 
__main__ section for usage + +import importlib +import json +import math +import os +import sys +import time + +import log_tools +import numpy as np +import runner_utils +from runner_utils import ModelResult + +logger = None + +print("MODEL RUNNER...") + +# Andrew: Adding the following line (switching the order of the following two lines) in order to append an arbitrary model's dependencies to the path *before* the benchmarks in order to accidentally use a benchmark dependency +# append ${MODEL_PYTHON_DIR} to $PATH if variable is set +python_dir = os.getenv("MODEL_PYTHON_DIR") +if python_dir: + sys.path.append(python_dir) +# append ${BENCHMARKS_ROOT}/common to $PATH if variable is set +benchmarks_root = os.getenv("BENCHMARKS_ROOT") +if benchmarks_root: + sys.path.append(benchmarks_root + "/common") + +# import candle_lrn_crv + +print("sys.path:") +for i in range(0, len(sys.path) - 1): + print("%2i: %s" % (i, sys.path[i])) +print("") + + +def import_pkg(framework, model_name): + # The model_name is the short form of the Benchmark: e.g., 'nt3' + # The module_name is the name of the Python module: e.g., 'nt3_baseline_keras2' + print("model_name: ", model_name) + module_name = os.getenv("MODEL_PYTHON_SCRIPT") + if framework == "keras": + if module_name == None or module_name == "": + module_name = "{}_abstention_keras2".format(model_name) + print("module_name:", module_name) + pkg = importlib.import_module(module_name) + elif framework == "pytorch": + import torch + + if module_name == None or module_name == "": + module_name = "{}_baseline_pytorch".format(model_name) + print("module_name:", module_name) + pkg = importlib.import_module(module_name) + else: + raise ValueError("Framework must either be `keras' or `pytorch' " + + "got `{}'!".format(framework)) + + return pkg + + +def log(msg): + global logger + logger.debug(msg) + + +def timestamp(): + from datetime import datetime + + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def setup_perf(params): + 
return {"top": setup_perf_top(params), "nvidia": setup_perf_nvidia(params)} + + +def setup_perf_top(params): + if "perf_top" not in params: + return None + if params["perf_top"] == "0": + return None + try: + delay = int(params["perf_top"]) + except: + msg = ('setup_perf_top(): params[perf_top] not an int: got: "%s"' % + params["perf_top"]) + print(msg) + raise Exception(msg) + import subprocess + + with open("perf-top.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen( + ["top", "-b", "-d", params["perf_top"]], + stdout=fp_out, + stderr=subprocess.STDOUT, + ) + return P + + +def setup_perf_nvidia(params): + if "perf_nvidia" not in params: + return None + if params["perf_nvidia"] == "0": + return None + try: + delay = int(params["perf_nvidia"]) + except: + msg = ("setup_perf_nvidia(): params[perf_nvidia] not an int: " + + 'got: "%s"' % params["perf_nvidia"]) + print(msg) + raise Exception(msg) + import subprocess + + with open("perf-nvidia.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen( + ["nvidia-smi", "--loop=" + params["perf_top"]], + stdout=fp_out, + stderr=subprocess.STDOUT, + ) + return P + + +def stop_perf(Ps): + for s in ["top", "nvidia"]: + if Ps[s] is not None: + Ps[s].terminate() + + +def run(hyper_parameter_map, obj_return): + start = time.time() + global logger + logger = log_tools.get_logger(logger, "MODEL RUNNER") + + log("START:") + sys.stdout.flush() + + directory = hyper_parameter_map["instance_directory"] + os.chdir(directory) + + with open(directory + "/rank.txt", "w") as fp: + fp.write(str(os.getenv("ADLB_RANK_SELF")) + "\n") + + framework = hyper_parameter_map["framework"] + model_name = hyper_parameter_map["model_name"] + pkg = import_pkg(framework, model_name) + + runner_utils.format_params(hyper_parameter_map) + + params_arg = {} + if "config_file" in hyper_parameter_map: + config_file = hyper_parameter_map["config_file"] + 
logger.info('specified config_file: "%s"' % config_file) + params_arg = {"default_model": config_file} + + # params is a python dictionary + params = setup_params(pkg, hyper_parameter_map, params_arg) + + Ps = setup_perf(params) + + # Run the model! + history = pkg.run(params) + + if framework == "keras": + runner_utils.keras_clear_session(framework) + + # Default result if there is no val_loss (as in infer.py) + result = 0 + history_result = {} + if history != None: + result, history_result = get_results(history, obj_return) + + stop_perf(Ps) + + finish = time.time() + duration = finish - start + log(" DONE: run_id %s in %0.2f seconds." % + (hyper_parameter_map["run_id"], duration)) + return (result, history_result) + + +def get_obj_return(): + obj_return = os.getenv("OBJ_RETURN") + valid_obj_returns = ["loss", "val_loss", "val_corr", "val_acc"] + if obj_return == None: + raise Exception("No OBJ_RETURN was in the environment!") + if obj_return not in valid_obj_returns: + raise Exception("Invalid value for OBJ_RETURN: use: " + + str(valid_obj_returns)) + return obj_return + + +def load_pre_post(hyper_parameter_map, key): + module = None + if key in hyper_parameter_map: + module_name = hyper_parameter_map[key] + module = importlib.import_module(module_name) + return module + + +def run_pre(hyper_parameter_map): + module = load_pre_post(hyper_parameter_map, "pre_module") + result = ModelResult.SUCCESS + if module != None: + logger.debug("PRE RUN START") + result = module.pre_run(hyper_parameter_map) + logger.debug("PRE RUN STOP") + return result + + +def run_post(hyper_parameter_map, output_map): + module = load_pre_post(hyper_parameter_map, "post_module") + if module != None: + logger.debug("POST RUN START") + module.post_run(hyper_parameter_map, output_map) + logger.debug("POST RUN STOP") + + +def run_model(hyper_parameter_map): + instance_directory = hyper_parameter_map["instance_directory"] + os.chdir(instance_directory) + global logger + logger = 
log_tools.get_logger(logger, "MODEL RUNNER") + obj_return = get_obj_return() + result = run_pre(hyper_parameter_map) + if result == ModelResult.ERROR: + print("run_pre() returned ERROR!") + exit(1) + elif result == ModelResult.SKIP: + log("run_pre() returned SKIP ...") + sys.stdout.flush() + return ("SKIP", "HISTORY_EMPTY") + else: + assert result == ModelResult.SUCCESS # proceed... + + result, history = run(hyper_parameter_map, obj_return) + runner_utils.write_output(result, instance_directory) + runner_utils.write_output( + json.dumps(history, cls=runner_utils.FromNPEncoder), + instance_directory, + "history.txt", + ) + + run_post(hyper_parameter_map, {}) + log("RUN STOP") + return (result, history) + + +def setup_params(pkg, hyper_parameter_map, params_arg): + params = pkg.initialize_parameters(**params_arg) + log("PARAM UPDATE START") + for k, v in hyper_parameter_map.items(): + if k == "dense" or k == "dense_feature_layers": + if type(v) != list: + v = v.split(" ") + v = [int(i) for i in v] + if k == "cell_features": + cp_str = v + v = list() + v.append(cp_str) + log(str(k) + " = " + str(v)) + params[k] = v + log("PARAM UPDATE STOP") + + log("WRITE_PARAMS START") + runner_utils.write_params(params, hyper_parameter_map) + log("WRITE_PARAMS STOP") + return params + + +def get_results(history, obj_return): + """Return the history entry that the user requested. 
+ + history: The Keras history object + """ + values = history.history[obj_return] + # Default: the last value in the history + result = values[-1] + + known_params = ["loss", "val_loss", "val_corr", "val_dice_coef"] + if obj_return not in known_params: + raise ValueError("Unsupported objective function: " + + "use obj_param to specify one of " + str(known_params)) + + # Fix NaNs: + if math.isnan(result): + if obj_return == "val_corr" or obj_return == "val_dice_coef": + # Return the negative result + result = -result + else: + # Just return a large number + result = 999999999 + + print("result: " + obj_return + ": " + str(result)) + history_result = history.history.copy() + return result, history_result + + +# Usage: see how sys.argv is unpacked below: +if __name__ == "__main__": + logger = log_tools.get_logger(logger, "MODEL_RUNNER") + log("RUN START") + + ( + _, # The Python program name (unused) + param_string, + instance_directory, + framework, + runid, + benchmark_timeout, + ) = sys.argv + + hyper_parameter_map = runner_utils.init(param_string, + instance_directory, + framework, + out_dir_key="save") + hyper_parameter_map["model_name"] = os.getenv("MODEL_NAME") + if hyper_parameter_map["model_name"] == None: + raise Exception("No MODEL_NAME was in the environment!") + hyper_parameter_map["experiment_id"] = os.getenv("EXPID") + hyper_parameter_map["run_id"] = runid + hyper_parameter_map["timeout"] = float(benchmark_timeout) + + # tensorflow.__init__ calls _os.path.basename(_sys.argv[0]) + # so we need to create a synthetic argv. 
+ # if (not hasattr(sys, 'argv')) or (len(sys.argv) == 0): + # sys.argv = ['nt3_tc1'] + sys.argv = ["null"] + run_model(hyper_parameter_map) diff --git a/workflows/common/python/model_runner.py b/workflows/common/python/model_runner.py index a38d06ae..ae7fb83f 100644 --- a/workflows/common/python/model_runner.py +++ b/workflows/common/python/model_runner.py @@ -1,200 +1,252 @@ - # MODEL RUNNER PY # See __main__ section for usage -import sys +import importlib import json +import math import os +import sys import time -import numpy as np +import traceback import importlib import runner_utils +from log_tools import * from runner_utils import ModelResult -import log_tools -import math logger = None -print("MODEL RUNNER...") +print("MODEL RUNNER MODULE") +sys.stdout.flush() -# Andrew: Adding the following line (switching the order of the following two lines) in order to append an arbitrary model's dependencies to the path *before* the benchmarks in order to accidentally use a benchmark dependency -# append ${MODEL_PYTHON_DIR} to $PATH if variable is set +# Set PYTHONPATH: +# Let MODEL_PYTHON_DIR override default Benchmarks model locations python_dir = os.getenv("MODEL_PYTHON_DIR") if python_dir: sys.path.append(python_dir) -# append ${BENCHMARKS_ROOT}/common to $PATH if variable is set -benchmarks_root = os.getenv("BENCHMARKS_ROOT") -if benchmarks_root: - sys.path.append(benchmarks_root+"/common") -# import candle_lrn_crv +# This is for candle_lib, which is not in Benchmarks any more +# benchmarks_root = os.getenv("BENCHMARKS_ROOT") +# if benchmarks_root: +# sys.path.append(benchmarks_root+'/common') +# Report PYTHONPATH for debugging print("sys.path:") -for i in range(0, len(sys.path)-1): +for i in range(0, len(sys.path) - 1): print("%2i: %s" % (i, sys.path[i])) print("") + def import_pkg(framework, model_name): # The model_name is the short form of the Benchmark: e.g., 'nt3' - # The module_name is the name of the Python module: e.g., 'nt3_baseline_keras2' + # The 
module_name is the name of the Python module: + # e.g., 'nt3_baseline_keras2' print("model_name: ", model_name) module_name = os.getenv("MODEL_PYTHON_SCRIPT") - if framework == 'keras': - if module_name == None or module_name == "": + if framework == "keras": + if module_name is None or module_name == "": module_name = "{}_baseline_keras2".format(model_name) - print ("module_name:", module_name) + print("module_name: " + module_name) pkg = importlib.import_module(module_name) + elif framework == "pytorch": + import torch # noqa: F401 - # For Summit: - from tensorflow.keras import backend as K - # For other systems: - # from keras import backend as K - if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: - import tensorflow as tf - inter_threads = int(os.environ['NUM_INTER_THREADS']) - intra_threads = int(os.environ['NUM_INTRA_THREADS']) - print("Configuring tensorflow with {} inter threads and " + - "{} intra threads" - .format(inter_threads, intra_threads)) - cfg = tf.ConfigProto(inter_op_parallelism_threads=inter_threads, - intra_op_parallelism_threads=intra_threads) - sess = tf.Session(graph=tf.get_default_graph(), config=cfg) - K.set_session(sess) - elif framework == 'pytorch': - import torch - if module_name == None or module_name == "": + if module_name is None or module_name == "": module_name = "{}_baseline_pytorch".format(model_name) - print ("module_name:", module_name) + print("module_name: " + module_name) pkg = importlib.import_module(module_name) else: - raise ValueError("Framework must either be `keras' or `pytorch' " + - "got `{}'!".format(framework)) + raise ValueError('Framework must either be "keras" or "pytorch" ' + + 'got: "{}"'.format(framework)) return pkg +# TODO: Separate INFO and DEBUG messages def log(msg): global logger logger.debug(msg) + def timestamp(): from datetime import datetime + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + def setup_perf(params): - return { 'top': setup_perf_top(params), - 'nvidia': 
setup_perf_nvidia(params) } + return {"top": setup_perf_top(params), "nvidia": setup_perf_nvidia(params)} def setup_perf_top(params): - if 'perf_top' not in params: + if "perf_top" not in params: return None - if params['perf_top'] == '0': + if params["perf_top"] == "0": return None try: - delay = int(params['perf_top']) - except: - msg = 'setup_perf_top(): params[perf_top] not an int: got: "%s"' % \ - params['perf_top'] + delay = int(params["perf_top"]) + except Exception: + msg = ('setup_perf_top(): params[perf_top] not an int: got: "%s"' % + params["perf_top"]) print(msg) raise Exception(msg) import subprocess - with open('perf-top.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['top', '-b', '-d', params['perf_top']], + + with open("perf-top.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen(["top", "-b", "-d", delay], stdout=fp_out, stderr=subprocess.STDOUT) return P + def setup_perf_nvidia(params): - if 'perf_nvidia' not in params: + if "perf_nvidia" not in params: return None - if params['perf_nvidia'] == '0': + if params["perf_nvidia"] == "0": return None try: - delay = int(params['perf_nvidia']) - except: - msg = 'setup_perf_nvidia(): params[perf_nvidia] not an int: ' + \ - 'got: "%s"' % params['perf_nvidia'] + delay = int(params["perf_nvidia"]) + except Exception: + msg = ("setup_perf_nvidia(): params[perf_nvidia] not an int: " + + 'got: "%s"' % params["perf_nvidia"]) print(msg) raise Exception(msg) import subprocess - with open('perf-nvidia.log', 'a') as fp_out: - fp_out.write('model_runner: start: %s\n\n' % timestamp()) - P = subprocess.Popen(['nvidia-smi', '--loop='+params['perf_top']], + + with open("perf-nvidia.log", "a") as fp_out: + fp_out.write("model_runner: start: %s\n\n" % timestamp()) + P = subprocess.Popen(["nvidia-smi", "--loop=%i" % delay], stdout=fp_out, stderr=subprocess.STDOUT) return P def stop_perf(Ps): - for s in ['top', 
'nvidia']: + for s in ["top", "nvidia"]: if Ps[s] is not None: Ps[s].terminate() -def run(hyper_parameter_map, obj_return): +def run(hyper_parameter_map, model_return): start = time.time() global logger - logger = log_tools.get_logger(logger, 'MODEL RUNNER') + logger = get_logger(logger, "MODEL RUNNER") - log("START:") - sys.stdout.flush() + logger.info("run(): START:") + # sys.stdout.flush() - directory = hyper_parameter_map['instance_directory'] + directory = hyper_parameter_map[ + "instance_directory"] # should be output_dir os.chdir(directory) - with open(directory + '/rank.txt', 'w') as fp: - fp.write(str(os.getenv('ADLB_RANK_SELF')) + '\n') + with open(directory + "/rank.txt", "w") as fp: + fp.write(str(os.getenv("ADLB_RANK_SELF")) + "\n") framework = hyper_parameter_map['framework'] + print("framework: " + str(framework)) + # sys.stdout.flush() model_name = hyper_parameter_map['model_name'] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) params_arg = {} - if 'config_file' in hyper_parameter_map: - config_file = hyper_parameter_map['config_file'] + if "CANDLE_DEFAULT_MODEL_FILE" in os.environ: + config_file = os.getenv("CANDLE_DEFAULT_MODEL_FILE") + logger.info('CANDLE_DEFAULT_MODEL_FILE: "%s"' % config_file) + params_arg = {"default_model": config_file} + if "config_file" in hyper_parameter_map: + config_file = hyper_parameter_map["config_file"] logger.info('specified config_file: "%s"' % config_file) - params_arg = { 'default_model': config_file } + params_arg = {"default_model": config_file} - # params is a python dictionary + # params is a Python dictionary params = setup_params(pkg, hyper_parameter_map, params_arg) Ps = setup_perf(params) - # Run the model! 
- history = pkg.run(params) + history = None + exception = False - if framework == 'keras': + # check for epochs if not present set to 1, + # used for checking early stopping in function get_results + if "epochs" in hyper_parameter_map: + epochs = hyper_parameter_map["epochs"] + else: + epochs = 1 + + log("PKG RUN START") + if framework == "keras": + + try: + # Run the model! + history = pkg.run(params) + except Exception as e: + logger.info("RUN EXCEPTION: " + str(e)) + print("RUN EXCEPTION: " + str(e)) + info = sys.exc_info() + s = traceback.format_tb(info[2]) + # This produces backslashes in output like "\n\n" + # on Frontier 2023-02-26 + # sys.stdout.write('\\n\\nEXCEPTION in model run(): \\n' + + # repr(e) + ' ... \\n' + ''.join(s)) + # sys.stdout.write('\\n') + sys.stdout.write('\n\nEXCEPTION in model run(): \n' + repr(e) + + ' ... \n' + ''.join(s)) + sys.stdout.write('\n') + sys.stdout.flush() + exception = True + exit(1) runner_utils.keras_clear_session(framework) - # Default result if there is no val_loss (as in infer.py) - result = 0 - history_result = {} - if history != None: - result, history_result = get_results(history, obj_return) + # Default result if there is no val_loss (as in infer.py) + result = 0 + history_result = {} + if not exception: + if history is not None: + if history == "EPOCHS_COMPLETED_ALREADY": + result, history_result = "EPOCHS_COMPLETED_ALREADY", None + else: + result, history_result = get_results( + history, model_return, epochs) + else: + result, history_result = "RUN_EXCEPTION", None - stop_perf(Ps) + elif framework == 'pytorch': + val_scores, infer_scores = pkg.run(params) + + class history: + + def __init__(self, val_scores): + self.history = {'val_loss': [val_scores['val_loss']]} + history = history(val_scores) + result, history_result = get_results(history, model_return, epochs) + + stop_perf(Ps) finish = time.time() duration = finish - start - log(" DONE: run_id %s in %0.2f seconds." 
% - (hyper_parameter_map["run_id"], duration)) + + # print the run_id and duration + logger.info("DONE: run_id %s in %0.2f seconds." % + (hyper_parameter_map["run_id"], duration)) + log("PKG RUN STOP") + sys.stdout.flush() + return (result, history_result) -def get_obj_return(): - obj_return = os.getenv("OBJ_RETURN") - valid_obj_returns = [ "loss", "val_loss", "val_corr", "val_acc" ] - if obj_return == None: - raise Exception("No OBJ_RETURN was in the environment!") - if obj_return not in valid_obj_returns: - raise Exception("Invalid value for OBJ_RETURN: use: " + - str(valid_obj_returns)) - return obj_return +def get_model_return(): + model_return = os.getenv("MODEL_RETURN") + valid_model_returns = ["loss", "val_loss", "val_corr", "val_acc"] + if model_return is None: + raise Exception("No MODEL_RETURN was in the environment!") + if model_return not in valid_model_returns: + raise Exception("Invalid value for MODEL_RETURN: use: " + + str(valid_model_returns)) + return model_return + def load_pre_post(hyper_parameter_map, key): module = None @@ -203,124 +255,159 @@ def load_pre_post(hyper_parameter_map, key): module = importlib.import_module(module_name) return module + def run_pre(hyper_parameter_map): - module = load_pre_post(hyper_parameter_map, 'pre_module') + module = load_pre_post(hyper_parameter_map, "pre_module") result = ModelResult.SUCCESS - if module != None: + if module is not None: logger.debug("PRE RUN START") result = module.pre_run(hyper_parameter_map) logger.debug("PRE RUN STOP") return result + def run_post(hyper_parameter_map, output_map): - module = load_pre_post(hyper_parameter_map, 'post_module') - if module != None: + module = load_pre_post(hyper_parameter_map, "post_module") + if module is not None: logger.debug("POST RUN START") module.post_run(hyper_parameter_map, output_map) logger.debug("POST RUN STOP") + def run_model(hyper_parameter_map): - instance_directory = hyper_parameter_map['instance_directory'] + # In-memory Python runs may not 
create sys.argv + if "argv" not in dir(sys): + # This is needed for CANDLE Benchmarks finalize_parameters(): + sys.argv = ["null"] + instance_directory = hyper_parameter_map["instance_directory"] os.chdir(instance_directory) global logger - logger = log_tools.get_logger(logger, "MODEL RUNNER") - obj_return = get_obj_return() + logger = get_logger(logger, "MODEL RUNNER") + model_return = get_model_return() + # logger.info("run_model: node: " + hyper_parameter_map['node']) + directory = hyper_parameter_map["instance_directory"] + os.chdir(directory) + if os.path.exists("stop.marker"): + logger.info("stop.marker exists!") + return ("SKIP", "STOP_MARKER") result = run_pre(hyper_parameter_map) if result == ModelResult.ERROR: print("run_pre() returned ERROR!") exit(1) elif result == ModelResult.SKIP: - log("run_pre() returned SKIP ...") + logger.info("run_pre() returned SKIP ...") + logger.info("model_runner: EXIT") sys.stdout.flush() + time.sleep(10) return ("SKIP", "HISTORY_EMPTY") else: - assert(result == ModelResult.SUCCESS) # proceed... - - result, history = run(hyper_parameter_map, obj_return) - runner_utils.write_output(result, instance_directory) - runner_utils.write_output(json.dumps(history, cls=runner_utils.FromNPEncoder), - instance_directory, 'history.txt') + assert result == ModelResult.SUCCESS # proceed... 
+ result, history = run(hyper_parameter_map, model_return) + runner_utils.write_output(result, directory) + runner_utils.write_output( + json.dumps(history, cls=runner_utils.FromNPEncoder), directory, + "history.txt") run_post(hyper_parameter_map, {}) - log("RUN STOP") + logger.info("RUN STOP") return (result, history) + def setup_params(pkg, hyper_parameter_map, params_arg): params = pkg.initialize_parameters(**params_arg) - log("PARAM UPDATE START") - for k,v in hyper_parameter_map.items(): + logger.debug("PARAM UPDATE START") + for k, v in hyper_parameter_map.items(): if k == "dense" or k == "dense_feature_layers": - if(type(v) != list): + if type(v) != list: v = v.split(" ") v = [int(i) for i in v] if k == "cell_features": cp_str = v v = list() v.append(cp_str) - log(str(k) + " = " + str(v)) + logger.debug(str(k) + " = " + str(v)) params[k] = v - log("PARAM UPDATE STOP") + logger.debug("PARAM UPDATE STOP") - log("WRITE_PARAMS START") + logger.debug("WRITE_PARAMS START") runner_utils.write_params(params, hyper_parameter_map) - log("WRITE_PARAMS STOP") + logger.debug("WRITE_PARAMS STOP") return params -def get_results(history, obj_return): +def get_results(history, model_return, epochs_expected): """ - Return the history entry that the user requested. - history: The Keras history object + Return the history entry that the user requested via MODEL_RETURN, + which may be math.nan in case of error. 
+ + Also checks for early stopping and if so marks the directory + with a 0-byte file named "stop.marker" + history: The TensorFlow history """ - values = history.history[obj_return] - # Default: the last value in the history - result = values[-1] - - known_params = [ "loss", "val_loss", "val_corr", "val_dice_coef" ] - if obj_return not in known_params: - raise ValueError("Unsupported objective function: " + - "use obj_param to specify one of " + + + logger.debug('get_results(): "%s"' % model_return) + + known_params = ["loss", "val_loss"] + + if model_return not in known_params: + raise ValueError("Unsupported objective function return " + 'key: "' + + model_return + '" - ' + + "use model_param to specify one of " + str(known_params)) - # Fix NaNs: - if math.isnan(result): - if obj_return == "val_corr" or obj_return == "val_dice_coef": - # Return the negative result - result = -result - else: - # Just return a large number - result = 999999999 + if model_return in history.history: + # Good value + values = history.history[model_return] + if len(values) < epochs_expected: + msg = "early stopping: %i/%i" % (len(values), epochs_expected) + logger.info("get_results(): " + msg) + with open("stop.marker", "w") as fp: + fp.write(msg + "\n") + print("VALUES: ", values, values[-1], type(values[-1])) + # Default: the last value in the history + result = float(values[-1]) + else: + logger.warning("get_results(): model return key " + + "not found: " + 'key: "' + model_return + '" - ' + + "history: " + str(history.history.keys())) + logger.warning("get_results(): returning NaN") + result = math.nan - print("result: " + obj_return + ": " + str(result)) + print("result: " + model_return + ": " + str(result)) history_result = history.history.copy() return result, history_result + # Usage: see how sys.argv is unpacked below: -if __name__ == '__main__': - logger = log_tools.get_logger(logger, "MODEL_RUNNER") - log("RUN START") +if __name__ == "__main__": + logger = 
get_logger(logger, "MODEL_RUNNER") + logger.info("main: RUN START") + + import sys - ( _, # The Python program name (unused) - param_string, - instance_directory, - framework, - runid, - benchmark_timeout ) = sys.argv + ( + _, # The Python program name (unused) + param_string, + instance_directory, + framework, + runid, + benchmark_timeout, + ) = sys.argv hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, - out_dir_key='save') - hyper_parameter_map['model_name'] = os.getenv("MODEL_NAME") - if hyper_parameter_map['model_name'] == None: + out_dir_key="save") + hyper_parameter_map["model_name"] = os.getenv("MODEL_NAME") + if hyper_parameter_map["model_name"] is None: raise Exception("No MODEL_NAME was in the environment!") - hyper_parameter_map['experiment_id'] = os.getenv("EXPID") - hyper_parameter_map['run_id'] = runid - hyper_parameter_map['timeout'] = float(benchmark_timeout) + hyper_parameter_map["experiment_id"] = os.getenv("EXPID") + hyper_parameter_map["run_id"] = runid + hyper_parameter_map["timeout"] = float(benchmark_timeout) # tensorflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. 
# if (not hasattr(sys, 'argv')) or (len(sys.argv) == 0): # sys.argv = ['nt3_tc1'] - sys.argv = ['null'] + sys.argv = ["null"] run_model(hyper_parameter_map) diff --git a/workflows/common/python/runner_utils.py b/workflows/common/python/runner_utils.py index 739a8d0b..6f111e1f 100644 --- a/workflows/common/python/runner_utils.py +++ b/workflows/common/python/runner_utils.py @@ -1,15 +1,26 @@ -import numpy as np -import json, os +import json +import os +import sys +import configparser + from enum import Enum +import numpy as np + try: - basestring + basestring except NameError: - basestring = str + basestring = str + +DATA_TYPES = { + type(np.float16): "f16", + type(np.float32): "f32", + type(np.float64): "f64" +} -DATA_TYPES = {type(np.float16): 'f16', type(np.float32): 'f32', type(np.float64): 'f64'} class FromNPEncoder(json.JSONEncoder): + def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -20,24 +31,27 @@ def default(self, obj): else: return super(FromNPEncoder, self).default(obj) -def write_output(result, instance_directory, fname='result.txt'): - with open('{}/{}'.format(instance_directory, fname), 'w') as f_out: + +def write_output(result, instance_directory, fname="result.txt"): + with open("{}/{}".format(instance_directory, fname), "w") as f_out: f_out.write("{}\n".format(result)) + def init(param_string, instance_directory, framework, out_dir_key): - #with open(param_file) as f_in: + # with open(param_file) as f_in: # hyper_parameter_map = json.load(f_in) hyper_parameter_map = json.loads(param_string.strip()) if not os.path.exists(instance_directory): os.makedirs(instance_directory) - hyper_parameter_map['framework'] = framework - hyper_parameter_map[out_dir_key] = '{}/output'.format(instance_directory) - hyper_parameter_map['instance_directory'] = instance_directory + hyper_parameter_map["framework"] = framework + hyper_parameter_map[out_dir_key] = "{}/output".format(instance_directory) + hyper_parameter_map["instance_directory"] = 
instance_directory return hyper_parameter_map + def is_numeric(val): try: float(val) @@ -45,8 +59,9 @@ def is_numeric(val): except ValueError: return False + def format_params(hyper_parameter_map): - for k,v in hyper_parameter_map.items(): + for k, v in hyper_parameter_map.items(): vals = str(v).split(" ") if len(vals) > 1 and is_numeric(vals[0]): # assume this should be a list @@ -55,38 +70,116 @@ def format_params(hyper_parameter_map): else: hyper_parameter_map[k] = [int(x) for x in vals] + def write_params(params, hyper_parameter_map): - parent_dir = hyper_parameter_map['instance_directory'] if 'instance_directory' in hyper_parameter_map else '.' + parent_dir = (hyper_parameter_map["instance_directory"] + if "instance_directory" in hyper_parameter_map else ".") f = "{}/parameters.txt".format(parent_dir) - montr=[] # Monitor params + montr = [] # Monitor params with open(f, "w") as f_out: f_out.write("[Global Params]\n") - for k,v in params.items(): + for k, v in params.items(): if type(v) in DATA_TYPES: v = DATA_TYPES[type(v)] if isinstance(v, basestring): v = "'{}'".format(v) - if(k =='solr_root' or k == 'timeout' ): + if k == "solr_root" or k == "timeout": # this must written at the end - montr.append((k,v)) + montr.append((k, v)) else: f_out.write("{}={}\n".format(k, v)) f_out.write("[Monitor Params]\n") for kv in montr: f_out.write("{}={}\n".format(*kv)) + +def expand_params(params, hyper_parameter_map): + parent_dir = (hyper_parameter_map["instance_directory"] + if "instance_directory" in hyper_parameter_map else ".") + result = "" + for k, v in params.items(): + if type(v) in DATA_TYPES: + v = DATA_TYPES[type(v)] + if isinstance(v, basestring): + v = "{}".format(v) + if k == "solr_root" or k == "timeout" or k == "id": + # this must written at the end + pass # Not a command-line parameter + else: + result += "--{} {} ".format(k, v) + return result + + def keras_clear_session(framework): - if framework == 'keras': + if framework == "keras": # works around 
this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from tensorflow.keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass + class ModelResult(Enum): SUCCESS = 1 SKIP = 2 ERROR = 3 + + +def read_config_file_dict(file: str) -> dict: + result = {} + config = configparser.ConfigParser() + config.read(file) + + for section in config.sections(): + for k, v in config.items(section): + result[k] = eval(v) + return result + + +def merge_params(defaults, params): + result = defaults.copy() + for k, v in params.items(): + print("merge_params(): set " + str(k) + ' = ' + str(v)) + result[k] = v + return result + + +def main(): + # Need argparse + if sys.argv[1] == "write_params": + # Merge params from the user-provided params file and + # the workflow-generated parameters + # Parse the workflow-provided JSON string: + J = json.loads(sys.argv[2]) + # Assume we are in the correct directory for this file: + defaults = read_config_file_dict(sys.argv[3]) + params = merge_params(defaults, J) + print("params: " + str(params)) + write_params(params, {}) + elif sys.argv[1] == "expand_params": + # Merge params from the user-provided params file and + # the workflow-generated parameters and create + # a set of command line flags to pass to CANDLE parser_utils + if not (len(sys.argv) == 3 or len(sys.argv) == 4): + print("runner_utils: bad subcommand args: " + str(sys.argv)) + exit(1) + # Parse the workflow-provided JSON string: + params = json.loads(sys.argv[2]) + if len(sys.argv) == 3: + pass # No defaults, OK + elif len(sys.argv) == 4: + defaults = read_config_file_dict(sys.argv[3]) + params = merge_params(defaults, params) + params = expand_params(params, {}) + print(params) + else: + print("runner_utils: unknown subcommand: " + str(sys.argv)) + exit(1) + + +if __name__ == "__main__": + main() diff --git a/workflows/common/python/utils.py 
# UTILS PY

import os
import re
import sys


def fail(*args):
    """Print a failure report and terminate the process.

    ``fail(message)`` exits with code 1; ``fail(e, code, message)`` also
    prints the exception and exits with ``code``.  Any other argument count
    is itself reported and exits with code 1 (it used to return silently,
    letting the caller carry on after a supposed failure).
    """
    if len(args) == 1:
        fail1(args[0])
    elif len(args) == 3:
        fail3(*args)
    else:
        fail1("fail(): expected 1 or 3 arguments, got: " + repr(args))


def fail1(message):
    """Fail with message, return exit code 1."""
    print(message)
    sys.exit(1)


def fail3(e, code, message):
    """Fail with message due to Exception e , return exit code."""
    print(message)
    print(str(e))
    sys.exit(code)


def avg(values):
    """Return the arithmetic mean of *values*.

    Raises ZeroDivisionError when *values* is empty.
    """
    total = sum(values)
    return total / len(values)


def append(filename, text):
    """Append *text* plus a newline to *filename*.

    On any error the process exits via fail() with code os.EX_IOERR.
    """
    try:
        with open(filename, "a") as fp:
            fp.write(text)
            fp.write("\n")
    except Exception as e:
        fail(e, os.EX_IOERR, "Could not append to: " + str(filename))


class Matcher:
    """Abstract class for use with Grepper."""

    def __init__(self, regexp):
        self.regexp = regexp
        self.pattern = re.compile(self.regexp)

    def match(self, line):
        """Call run(line) when the pattern matches at the start of *line*.

        Always returns None; results are communicated through whatever
        state the subclass's run() records.
        """
        if self.pattern.match(line) is None:
            return None
        self.run(line)

    def run(self, line):
        """User code should override this."""
        pass

    def reset(self):
        """User code should override this."""
        pass


class Grepper:
    """Feed every line of a file to a list of Matchers."""

    def __init__(self, matchers):
        """matchers: List of Matchers"""
        self.matchers = matchers

    def grep(self, filename):
        """Offer each line of *filename* (newline included) to each matcher."""
        with open(filename, "r") as fp:
            for line in fp:
                for matcher in self.matchers:
                    matcher.match(line)

    def reset(self):
        """Reset every matcher."""
        for matcher in self.matchers:
            matcher.reset()


def columnPrint(D, aligns):
    """Print a dict of string columns as an aligned table.

    D: a dict mapping a header string to a list of string data
    aligns: a string "llrlr" for left or right alignment by column

    Each column is as wide as its longest entry (header included) and is
    followed by one space.  Headers are always left-aligned.  All rows are
    printed; columns shorter than the longest one are padded with empty
    cells.
    """
    headers = list(D.keys())
    # assert is kept (not ValueError) so callers see the same exception type.
    assert len(aligns) == len(
        headers), "Length of aligns (%i) does not match headers (%i)!" % (
            len(aligns),
            len(headers),
        )

    # Format specs for headers
    fmth = ""
    # Format specs for data
    fmtd = ""
    maxlist = 0
    for header, align in zip(headers, aligns):
        column = D[header]
        maxlist = max(maxlist, len(column))
        maxstr = max([len(header)] + [len(item) for item in column])
        # Header is always left-aligned
        fmth += "%%-%is " % maxstr
        sign = "-" if align == "l" else ""
        fmtd += "%%%s%is " % (sign, maxstr)
    # Start printing
    print(fmth % tuple(headers))
    # range(maxlist), not maxlist - 1: the old bound dropped the last row.
    for i in range(maxlist):
        row = [D[header][i] if i < len(D[header]) else ""
               for header in headers]
        print(fmtd % tuple(row))
Determining the run directory, which is the +TURBINE_OUTPUT+ directory used by Swift/T. . Set key environment variables and Swift/T options for the system scheduler. These include +PROCS+, +WALLTIME+, +QUEUE+, +PROJECT+, etc. *(#20)* These are now set for each site in +sched-site.sh+. -. Loading modules. *(#20)* These are now set for each site in +modules-site.sh+. +. Loading modules. *(#20)* These are now set for each site in +env-site.sh+. + +. Set key environment variables for Python, R, etc. These include +PYTHONPATH+, +LD_LIBRARY_PATH+, etc. These are now set for each site in +env-site.sh+. . Determining which Swift functions to load. Swift/T may or may not be configured to use its efficient in-memory Python interpreter for the Benchmarks (because of challenges compiling against the site-provided Python plus deep learning stack). A shell variable called +SWIFT_IMPL+ denotes the Swift function implementation for the benchmarks. The value is one of: + @@ -39,17 +38,19 @@ The https://github.com/ECP-CANDLE/Supervisor/tree/master/workflows#objective-fun . Record what happened. This involves writing additional logs into the TURBINE_OUTPUT directory, particularly to capture settings and provenance that Swift/T cannot. -=== Future ideas +== Site list + +=== Summit + +`MED106=/gpfs/alpine/world-shared/med106` -Deduplication (https://github.com/ECP-CANDLE/Supervisor/issues/20[#20]). The current scripts flex the previously developed EMEWS templates, which make it easy to rapidly develop many workflows; additionally, they are highly linear, readable scripts. However, they duplicate a great deal of code, making Supervisor development more difficult. +==== `env-summit.sh` -. Source reusable settings. -.. Write key environment variables once for each system. E.g., https://github.com/ECP-CANDLE/Supervisor/blob/master/workflows/common/sh/langs-cori.sh[langs-cori.sh] -.. Same for scheduler settings, module settings. -. Create a new, site-generic workflow_sh for each workflow. 
This script will take the +site+ as an argument and source the appropriate settings files. -. Put application parameters in a separate file. This can be Bash-formatted for compatibility now, but may become something else. -. Install EQ/Py, EQ/R from Git and put in a common location, maintained by Wozniak. -. Test scripts. Each workflow directory will have a +test/+ subdirectory. This will contain enough bootstrapping code so that it can run and do something small without user configuration. This will build on the daily testing scripts that Brettin has started. +* GCC: 7.5.0 +* ROOT: `$MED106/sw/summit/gcc-7.5.0` +* Swift/T: `2022-04-12` +* Python: `/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0` +* R: `$ROOT/R/4.1.3` == Other shell tools diff --git a/workflows/common/sh/cfg-sys-summit-tf1.sh b/workflows/common/sh/cfg-sys-summit-tf1.sh new file mode 100644 index 00000000..529772c6 --- /dev/null +++ b/workflows/common/sh/cfg-sys-summit-tf1.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# UPF CFG SYS 1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +#export QUEUE=${QUEUE:-batch} + +# Cori: (cf. sched-cori) +# export QUEUE=${QUEUE:-debug} +# Cori queues: debug, regular +# export QUEUE=regular +# export QUEUE=debug +# CANDLE on Cori: +# export PROJECT=m2924 + +# Theta: (cf. 
sched-theta) +# export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-debug-flat-quad} +# export PROJECT=${PROJECT:-ecp-testbed-01} +# export PROJECT=Candle_ECP +#export PROJECT=CSC249ADOA01 + +# Summit: +export QUEUE=${QUEUE:-batch} + +export PROJECT=${PROJECT:-med106} + +export WALLTIME=${WALLTIME:-0:30} + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=wozniak@mcs.anl.gov + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # probably not needed but this variable is baked into rest of code, e.g., workflow.sh + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# BENCHMARK_DIR=/path/to/ +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . +export IGNORE_ERRORS=0 diff --git a/workflows/common/sh/cfg-sys-summit-tf2.sh b/workflows/common/sh/cfg-sys-summit-tf2.sh new file mode 100644 index 00000000..529772c6 --- /dev/null +++ b/workflows/common/sh/cfg-sys-summit-tf2.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# UPF CFG SYS 1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} + +#export QUEUE=${QUEUE:-batch} + +# Cori: (cf. sched-cori) +# export QUEUE=${QUEUE:-debug} +# Cori queues: debug, regular +# export QUEUE=regular +# export QUEUE=debug +# CANDLE on Cori: +# export PROJECT=m2924 + +# Theta: (cf. 
sched-theta) +# export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-debug-flat-quad} +# export PROJECT=${PROJECT:-ecp-testbed-01} +# export PROJECT=Candle_ECP +#export PROJECT=CSC249ADOA01 + +# Summit: +export QUEUE=${QUEUE:-batch} + +export PROJECT=${PROJECT:-med106} + +export WALLTIME=${WALLTIME:-0:30} + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=wozniak@mcs.anl.gov + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. -1 is no timeout. +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # probably not needed but this variable is baked into rest of code, e.g., workflow.sh + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# BENCHMARK_DIR=/path/to/ +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
+export IGNORE_ERRORS=0 diff --git a/workflows/common/sh/env-biowulf.sh b/workflows/common/sh/env-biowulf.sh index b9885481..b7be0362 100644 --- a/workflows/common/sh/env-biowulf.sh +++ b/workflows/common/sh/env-biowulf.sh @@ -1,56 +1,84 @@ #!/bin/bash -# Assume candle module is loaded as usual +# Note: It probably would make most sense to source site-specific_settings.sh here and then to use below the variables set in that file +# Prerequisite: Assume the candle module is loaded as usual +# This is a second test comment line -# Load the environments for each MPI implementation -if [ $USE_OPENMPI -eq 1 ]; then - #module load gcc/7.3.0 openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 # Note I had to stop using openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 because at least as of 6/19/19 Biowulf seemed to stop supporting it (it was available only as a "hidden" module) - module load gcc/7.3.0 openmpi/3.1.3/cuda-9.2/gcc-7.3.0-pmi2 tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 - export OMPI_MCA_mpi_warn_on_fork=0 + +#### Set variables for CANDLE dependencies (mostly, Swift/T dependencies) ########################################################## +# This is for building CANDLE/Swift/T but it doesn't hurt to set these always +export CANDLE_DEP_MPI="/usr/local/OpenMPI/4.0.4/CUDA-10.2/gcc-9.2.0" +export CANDLE_DEP_TCL="/data/BIDS-HPC/public/software/builds/tcl" +export CANDLE_DEP_PY="/usr/local/Anaconda/envs/py3.7" +export CANDLE_DEP_R="/usr/local/apps/R/4.0/4.0.0/lib64/R" +export CANDLE_DEP_R_SITE="/usr/local/apps/R/4.0/site-library_4.0.0" +export CANDLE_DEP_ANT="/usr/local/apps/ant/1.10.3" +export CANDLE_LAUNCHER_OPTION="--with-launcher=/usr/local/slurm/bin/srun" +#################################################################################################################################### + + +#### Load the stack ################################################################################################################ +# Load 
the lmod environment modules +module load gcc/9.2.0 openmpi/4.0.4/cuda-10.2/gcc-9.2.0 ant/1.10.3 java/1.8.0_211 pcre2/10.21 GSL/2.6_gcc-9.2.0 + +# Load the Tcl we built on 9/12/20 +export PATH="/data/BIDS-HPC/public/software/builds/tcl/bin:$PATH" +export LD_LIBRARY_PATH="/data/BIDS-HPC/public/software/builds/tcl/lib:$LD_LIBRARY_PATH" +export MANPATH="/data/BIDS-HPC/public/software/builds/tcl/man:$MANPATH" + +# Load R/4.0.0 paths manually since we can't load the module on the Biowulf submit nodes (part of new stack on 8/13/20) +export PATH="$PATH:/usr/local/apps/R/4.0/4.0.0/bin" +export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/intel/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64" +if [ -z ${R_LIBS_USER+x} ]; then + R_LIBS_USER="$HOME/R/%v/library" else - module load tcl_tk/8.6.8_gcc-7.2.0 ant/1.10.3 java/1.8.0_181 - module remove openmpi/3.0.2/gcc-7.3.0 - module load gcc/7.2.0 - export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH - export PATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/bin:$PATH - export LD_LIBRARY_PATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/lib:$LD_LIBRARY_PATH - export LIBDIR=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/lib:$LIBDIR - export CPATH=/data/BIDS-HPC/public/software/builds/mpich-3.3-3/include:$CPATH + R_LIBS_USER="$R_LIBS_USER:$HOME/R/%v/library" fi +export R_LIBS_SITE="$CANDLE_DEP_R_SITE" +export R_LIBS="$CANDLE/R/libs" +#################################################################################################################################### + + +#### Swift/T/MPI setup ############################################################################################################# +# Basic Swift/T settings +export SWIFT_T_INSTALL="$CANDLE/swift-t-install" +export PATH="$PATH:$SWIFT_T_INSTALL/stc/bin" +export PATH="$PATH:$SWIFT_T_INSTALL/turbine/bin" +export 
PYTHONPATH="$PYTHONPATH:$SWIFT_T_INSTALL/turbine/py" +export TURBINE_HOME="$SWIFT_T_INSTALL/turbine" +export TURBINE_LOG="1" +export ADLB_DEBUG_RANKS="1" +export ADLB_DEBUG_HOSTMAP="1" +export CANDLE_MODEL_IMPL="app" -# Load R/3.5.0 paths manually since we can't load the module on the Biowulf submit nodes -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/GSL/gcc-7.2.0/2.4/lib:/usr/local/geos/3.6.2/lib:/usr/local/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64 -export PATH=$PATH:/usr/local/GSL/gcc-7.2.0/2.4/bin:/usr/local/apps/R/3.5/3.5.0_build2/bin -export R_LIBS_SITE=/usr/local/apps/R/3.5/site-library_build2 -export R_LIBS_USER=~/R/%v/library -export R_LIBS=$CANDLE/R/libs - -# Swift/T setup -export SWIFT_T_INSTALL=$CANDLE/swift-t-install -# NOTE: Below is 1 of 2 lines needed to run swift-t out-of-the-box -export PATH=$PATH:$SWIFT_T_INSTALL/stc/bin -export PATH=$PATH:$SWIFT_T_INSTALL/turbine/bin -export PYTHONPATH=$PYTHONPATH:$SWIFT_T_INSTALL/turbine/py -export TURBINE_HOME=$SWIFT_T_INSTALL/turbine -export TURBINE_LOG=1 -export ADLB_DEBUG_RANKS=1 -export ADLB_DEBUG_HOSTMAP=1 -export SWIFT_IMPL="app" # Resident task workers and ranks -if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then - # Resident task workers and ranks - export TURBINE_RESIDENT_WORK_WORKERS=1 +if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ]; then # if $TURBINE_RESIDENT_WORK_WORKERS is unset... + export TURBINE_RESIDENT_WORK_WORKERS="1" export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi -# NOTE: Below is 2 of 2 lines needed to run swift-t out-of-the-box (no longer needed!!) 
-#export LD_PRELOAD=/usr/local/slurm/lib/libslurm.so:$LD_PRELOAD # this is the only way aside from recompiling Swift/T I believe to get past an error regarding /usr/local/slurm/lib/slurm/auth_munge.so, e.g., "/usr/local/Tcl_Tk/8.6.8/gcc_7.2.0/bin/tclsh8.6: symbol lookup error: /usr/local/slurm/lib/slurm/auth_munge.so: undefined symbol: slurm_debug" # Set up EMEWS Queues -export EQR=$CANDLE/Supervisor/workflows/common/ext/EQ-R # I don’t know where else to find this directory that needs to be available, e.g., in workflow.sh -export EQPy=$CANDLE/Supervisor/workflows/common/ext/EQ-Py +export EQR="$CANDLE/Supervisor/workflows/common/ext/EQ-R" +export EQPy="$CANDLE/Supervisor/workflows/common/ext/EQ-Py" + +# This is how Tim Miller told me to run interactive and batch MPI jobs on Biowulf GPU nodes recently (Aug/Sep 2020) +if [ "x${SLURM_JOB_PARTITION:-batch}" == "xinteractive" ]; then + export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix --mem=0" +else + export TURBINE_LAUNCH_OPTIONS+=" --mpi=pmix" +fi + +# This prevents PMIx errors I believe +export TURBINE_MPI_THREAD=0 # only currently used in Supervisor/workflows/upf/swift/workflow.sh +#################################################################################################################################### + -# Other additions -export PYTHONPATH=$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python +#### Miscellaneous settings/output ################################################################################################# +# Add the Supervisor workflows scripts to the Python path +export PYTHONPATH="$PYTHONPATH:$CANDLE/Supervisor/workflows/common/python" # Log settings to output -which python swift-t +command -v python || echo "WARNING: Program 'python' not found" +command -v swift-t || echo "WARNING: Program 'swift-t' not found" +#################################################################################################################################### diff --git a/workflows/common/sh/env-cori.sh 
b/workflows/common/sh/env-cori.sh index 92a64f39..f9ee9323 100644 --- a/workflows/common/sh/env-cori.sh +++ b/workflows/common/sh/env-cori.sh @@ -16,7 +16,7 @@ SWIFT=/global/homes/w/wozniak/Public/sfw/compute/swift-t-2018-06-05 export PATH=$SWIFT/stc/bin:$PATH # On Cori, we have a good Swift/T Python embedded interpreter, # but we use app anyway -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # Python PYTHON=/global/common/cori/software/python/2.7-anaconda/envs/deeplearning diff --git a/workflows/common/sh/env-crusher.sh b/workflows/common/sh/env-crusher.sh new file mode 100644 index 00000000..e40081e4 --- /dev/null +++ b/workflows/common/sh/env-crusher.sh @@ -0,0 +1,30 @@ + +# ENV Crusher + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# CANDLE software installation root: +MED106=/gpfs/alpine/world-shared/med106 + +# Gounley installation: +ROOT=$MED106/gounley1/crusher2 +SWIFT=$ROOT/swift-t-install + +# Wozniak installation: +# ROOT=$MED106/sw/crusher/gcc-11.2.0 +# SWIFT=$ROOT/swift-t/2022-08-10 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# Set up Python: +PY=/gpfs/alpine/med106/world-shared/gounley1/crusher2/conda520tf +export PYTHONHOME=$PY + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Dummy setting: EQ/R is not installed on Spock yet +EQR=not-installed diff --git a/workflows/common/sh/env-default.sh b/workflows/common/sh/env-default.sh index 08f585bb..9bfae8e9 100644 --- a/workflows/common/sh/env-default.sh +++ b/workflows/common/sh/env-default.sh @@ -5,7 +5,7 @@ export PYTHONPATH=${EMEWS_PROJECT_ROOT}/python:${PYTHONPATH:-} -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] @@ -15,3 +15,5 @@ then export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) fi +# This can be used for an OpenMPI hosts file +# export TURBINE_LAUNCH_OPTIONS="--hostfile $HOME/hosts.txt" diff --git a/workflows/common/sh/env-dunedin.sh b/workflows/common/sh/env-dunedin.sh index d2b320a8..57c4c958 100644 --- 
a/workflows/common/sh/env-dunedin.sh +++ b/workflows/common/sh/env-dunedin.sh @@ -4,7 +4,7 @@ # Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set # Python -PY=/home/wozniak/Public/sfw/anaconda3-tf +PY=/home/wozniak/Public/sfw/anaconda3 export PYTHONPATH=${PYTHONPATH:-}${PYTHONPATH:+:} PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PYTHONHOME=$PY @@ -16,7 +16,7 @@ export R_HOME=/home/wozniak/Public/sfw/R-3.5.3/lib/R # Swift/T export PATH=/home/wozniak/Public/sfw/swift-t/stc/bin:$PATH -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # EMEWS Queues for R # EQR=/opt/EQ-R @@ -32,7 +32,8 @@ fi # LD_LIBRARY_PATH export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}${LD_LIBRARY_PATH:+:} -#LD_LIBRARY_PATH+=$R_HOME/lib +LD_LIBRARY_PATH+=$R_HOME/lib: +LD_LIBRARY_PATH+=$R_HOME/library/RInside/lib # LD_LIBRARY_PATH+=:/home/wozniak/Public/sfw/anaconda3/lib LD_LIBRARY_PATH=/usb2/wozniak/Public/sfw/R-3.5.3/lib/R/lib:$LD_LIBRARY_PATH show LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-frontier.sh b/workflows/common/sh/env-frontier.sh new file mode 100644 index 00000000..47974aab --- /dev/null +++ b/workflows/common/sh/env-frontier.sh @@ -0,0 +1,28 @@ + +# ENV Frontier + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# ROOT=/autofs/nccs-svm1_home1/wozniak/Public/sfw/frontier +# SWIFT=$ROOT/swift-t/2023-04-26 # Good + +ROOT=/lustre/orion/med106/world-shared/sfw +# SWIFT=$ROOT/swift-t/2023-05-08 # MPI-IO fix +SWIFT=$ROOT/swift-t/2023-05-10 # PMI SYNC + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=/lustre/orion/world-shared/med106/gounley1/conda543 +PATH=$PY/bin:$PATH + +# EMEWS Queues for R +# EQR=$ROOT/EQ-R + +# EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 diff --git a/workflows/common/sh/env-gce.sh b/workflows/common/sh/env-gce.sh new file mode 100644 index 00000000..f2f04e07 --- /dev/null +++ b/workflows/common/sh/env-gce.sh @@ -0,0 +1,34 @@ + +# ENV GCE +# Environment 
settings for ANL/GCE compute nodes + +SFW=/nfs/gce/projects/Swift-T/sfw/x86_64/U20 +# Python only: +# SWIFT=$SFW/swift-t/mpich/2022-11-14-Jenkins +# Python+R: +SWIFT=$SFW/swift-t/mpich/2022-11-14-Jenkins + +PATH=$SWIFT/stc/bin:$PATH + +PYTHON=/nfs/gce/globalscratch/jain/conda_installs/ +export PATH=$PYTHON/bin:$PATH +export PYTHONHOME=$PYTHON + +echo $SWIFT + +# Needed for Swift/T+R +export LD_LIBRARY_PATH=$SFW/R-4.1.0/lib/R/lib + +export PYTHONPATH=${PYTHONPATH:-} + +EQR=$SFW/EQ-R +CANDLE_MODEL_IMPL="app" + +# For test output processing: +export LOCAL=1 +export CRAY=0 + +# Cf. utils.sh +log_path PATH +log_path LD_LIBRARY_PATH +log_path PYTHONPATH diff --git a/workflows/common/sh/env-lambda.sh b/workflows/common/sh/env-lambda.sh new file mode 100644 index 00000000..c2cbb963 --- /dev/null +++ b/workflows/common/sh/env-lambda.sh @@ -0,0 +1,29 @@ + +# ENV Lambda +# Environment settings for Lambda (Swift, Python, R, Tcl, etc.) + +# Everything is installed in here: +SFW=/homes/woz/Public/sfw + +SWIFT=$SFW/swift-t/2022-11-02 +PY=$SFW/Anaconda +# EQPY=$SFW/EQ-Py +EQR=$SFW/EQ-R +R=$SFW/R-4.1.0 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} + +# How to run CANDLE models: +CANDLE_MODEL_IMPL="app" + +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-lambda7.sh b/workflows/common/sh/env-lambda7.sh new file mode 100644 index 00000000..b779666c --- /dev/null +++ b/workflows/common/sh/env-lambda7.sh @@ -0,0 +1,29 @@ + +# ENV Lambda7 +# Environment settings for Lambda (Swift, Python, R, Tcl, etc.) 
+ +# Everything is installed in here: +SFW=/homes/woz/Public/sfw + +SWIFT=$SFW/swift-t/2023-05-26 +PY=$SFW/Miniconda +# EQPY=$SFW/EQ-Py +export EQR=$SFW/EQ-R +R=$SFW/R-4.1.0 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=$R/lib/R/lib:${LD_LIBRARY_PATH:-} + +# How to run CANDLE models: +CANDLE_MODEL_IMPL="app" + +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-local.sh b/workflows/common/sh/env-local.sh index c75b27d8..d7a9de09 100644 --- a/workflows/common/sh/env-local.sh +++ b/workflows/common/sh/env-local.sh @@ -4,14 +4,14 @@ # Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set # Modify to specify the location of SWIFT_T installation export SWIFT_T=${SWIFT_T:-$HOME/install/swift-t/} -export LD_LIBRARY_PATH+=$SWIFT_T/turbine/lib:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib +export LD_LIBRARY_PATH+=:$SWIFT_T/turbine/lib:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib:$SWIFT_T/stc/lib: # Python export PYTHONPATH=${PYTHONPATH:-}${PYTHONPATH:+:} PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PATH=$SWIFT_T/stc/bin:$PATH -SWIFT_IMPL="py" +CANDLE_MODEL_IMPL="py" # EMEWS Queues for R EQR=$WORKFLOWS_ROOT/common/ext/EQ-R diff --git a/workflows/common/sh/env-mbook.sh b/workflows/common/sh/env-mbook.sh new file mode 100644 index 00000000..fa2709fc --- /dev/null +++ b/workflows/common/sh/env-mbook.sh @@ -0,0 +1,44 @@ + +# ENV mbook +# Environment settings for mbook (Swift, Python, R, Tcl, etc.) 
+ +# Everything is installed in here: +SFW=/Users/mbook/install/ + +SWIFT=$SFW/swift-t/ +PY=/opt/homebrew/anaconda3/envs/tensorflow/ +# EQPY=$SFW/EQ-Py +EQR=/Users/mbook/Supervisor/workflows/common/ext/EQ-R/ + +PATH=$SWIFT/stc/bin:$PATH +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=/Library/Frameworks/R.framework/Resources/lib/:${LD_LIBRARY_PATH:-} + +# How to run CANDLE models: +CANDLE_MODEL_IMPL="app" + +# PYTHONPATH=$EQPY/src:${PYTHONPATH:-} + +# Log settings to output +echo "Programs:" +which python swift-t | nl +# Cf. utils.sh +show PYTHONHOME + +### +export PYTHONHOME=$PY + +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" +export PATH="$PYTHONHOME/bin:$PATH" + +COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python +PYTHONPATH+=":$PYTHONHOME/lib/:" +PYTHONPATH+=":$COMMON_DIR:" + +APP_PYTHONPATH=${APP_PYTHONPATH:-} +PYTHONPATH+=":$APP_PYTHONPATH" +### + +log_path LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-mcs.sh b/workflows/common/sh/env-mcs.sh index d150d24e..07a27b31 100755 --- a/workflows/common/sh/env-mcs.sh +++ b/workflows/common/sh/env-mcs.sh @@ -1,12 +1,16 @@ -# LANGS LOCAL -# Language settings for any local machine like Ubuntu -# Assumes WORKFLOWS_ROOT, BENCHMARK_DIR, BENCHMARKS_ROOT are set +# ENV MCS +# Environment settings for ANL/MCS compute nodes + +# MCS CLUSTER IS GONE -- DELETE THIS ONCE JENKINS/GCE WORKS -- 2022-11-09 export PY=/homes/jain/anaconda3/bin/python/ export R=/home/wozniak/Public/sfw/x86_64/R-3.4.1/lib/R/ # Modify to specify the location of SWIFT_T installation -export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} +# export SWIFT_T=${SWIFT_T:-/homes/jain/install/swift-t/} +export SWIFT_T=/homes/wozniak/Public/sfw/x86_64/swift-t/2020-06-04 + +export LD_LIBRARY_PATH+=/homes/jain/anaconda3/lib/:/nfs2/jain/spack/opt/spack/linux-ubuntu14-x86_64/gcc-4.4.7/cuda-8.0.61-kxwh3jwkxjybyo3n3nnajezfyq3epo5y/lib:/usr/lib export 
LD_LIBRARY_PATH+=:$R/lib:$SWIFT_T/stc/lib:$SWIFT_T/turbine/lib/:$SWIFT_T/lb/lib:$SWIFT_T/cutils/lib # Python @@ -15,10 +19,11 @@ PYTHONPATH+=$WORKFLOWS_ROOT/common/python: export PATH=$SWIFT_T/turbine/bin:$SWIFT_T/stc/bin:$PATH echo $PATH -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="py" # EMEWS Queues for R -EQR=$WORKFLOWS_ROOT/common/ext/EQ-R +# EQR=$WORKFLOWS_ROOT/common/ext/EQ-R +EQR=/home/wozniak/Public/sfw/x86_64/EQ-R EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py # Resident task workers and ranks if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] @@ -36,6 +41,6 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}${LD_LIBRARY_PATH:+:} export LOCAL=1 export CRAY=0 -# Cf. utils.s +# Cf. utils.sh log_path LD_LIBRARY_PATH log_path PYTHONPATH diff --git a/workflows/common/sh/env-pascal.sh b/workflows/common/sh/env-pascal.sh index 11f1d710..22583292 100644 --- a/workflows/common/sh/env-pascal.sh +++ b/workflows/common/sh/env-pascal.sh @@ -5,7 +5,7 @@ if [ -z "$SUPERVISOR_HOME" ]; then echo "SUPERVISOR_HOME is blank"; else echo "SUPERVISOR_HOME is set to '$SUPERVISOR_HOME'"; fi source ${SUPERVISOR_HOME}/spack/loads -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app # EMEWS Queues for R EQR=$(spack location -i eqr) diff --git a/workflows/common/sh/env-polaris.sh b/workflows/common/sh/env-polaris.sh new file mode 100644 index 00000000..98eac55a --- /dev/null +++ b/workflows/common/sh/env-polaris.sh @@ -0,0 +1,21 @@ + +# ENV Polaris + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=app + +CSC249=/lus/grand/projects/CSC249ADOA01 +ROOT=$CSC249/public/sfw/polaris +SWIFT=$ROOT/swift-t/2023-06-05 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +PY=$ROOT/Miniconda +PATH=$PY/bin:$PATH + +R_HOME=$ROOT/R-4.2.2/lib64/R +EQR=$ROOT/EQ-R + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:$R_HOME/lib diff --git a/workflows/common/sh/env-spack.sh b/workflows/common/sh/env-spack.sh index 1ac6d916..c70f9972 100644 --- a/workflows/common/sh/env-spack.sh +++ 
b/workflows/common/sh/env-spack.sh @@ -34,7 +34,7 @@ fi TURBINE_PY=$( readlink --canonicalize $TURBINE_PY_LIB/.. ) PATH=$TURBINE_PY/bin:$PATH -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # # EMEWS Queues for R # EQR=$( spack find -p eqr | sed -n 's/eqr@[.0-9]*//p' ) diff --git a/workflows/common/sh/env-spock.sh b/workflows/common/sh/env-spock.sh new file mode 100644 index 00000000..e9136d4a --- /dev/null +++ b/workflows/common/sh/env-spock.sh @@ -0,0 +1,26 @@ + +# ENV Spock + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# CANDLE software installation root: +MED106=/gpfs/alpine/world-shared/med106 +# ROOT=$MED106/sw/spock/gcc-10.3.0 +ROOT=$MED106/sw/spock/gcc-11.2.0 + +# Add Swift/T to PATH +SWIFT=$ROOT/swift-t/2021-11-14 +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# Set up Python: +PY=/gpfs/alpine/med106/world-shared/hsyoo/spock_tf2_py37_rocm42 +export PYTHONHOME=$PY + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Dummy setting: EQ/R is not installed on Spock yet +EQR=not-installed diff --git a/workflows/common/sh/env-summit-i.sh b/workflows/common/sh/env-summit-i.sh new file mode 100644 index 00000000..3b604156 --- /dev/null +++ b/workflows/common/sh/env-summit-i.sh @@ -0,0 +1,76 @@ + +# ENV Summit Interactive +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
+# GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +# module load ibm-wml-ce/1.6.2-3 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R +# Python (ibm-wml-ce/1.7.0-1) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c +# Python (ibm-wml-ce/1.6.2-3) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 +# Python (med106/sw/condaenv-200408) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# log_path PATH + +# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 + +# export LD_LIBRARY_PATH +# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH + +# Inject Python to PATH using PRELAUNCH: +# This would be better, but is broken for ZSH users: +# module load ibm-wml-ce/1.6.2-3 +# Must use PATH directly: +# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +# LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY + +PATH=$PY/bin:$PATH + +export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and 
rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/env-summit-login.sh b/workflows/common/sh/env-summit-login.sh index df8033cc..83ad938a 100644 --- a/workflows/common/sh/env-summit-login.sh +++ b/workflows/common/sh/env-summit-login.sh @@ -2,15 +2,18 @@ # ENV SUMMIT LOGIN # Environment settings for Summit login node (Swift, Python, R, Tcl, etc.) -# SWIFT_IMPL=app -SWIFT_IMPL=py +CANDLE_MODEL_IMPL=echo +# CANDLE_MODEL_IMPL=app +# CANDLE_MODEL_IMPL=py # Load basic LD_LIBRARY_PATH before changing it: -module load gcc/7.4.0 -module load ibm-wml +# module load gcc/7.4.0 +module load gcc/6.4.0 +# module load ibm-wml module unload darshan-runtime module unload spectrum-mpi -module load gcc/7.4.0 +# module load gcc/7.4.0 +module load gcc/6.4.0 module list @@ -18,10 +21,12 @@ log_path PATH # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -SWIFT=$MED106/sw/login/gcc-7.4.0/swift-t/2019-10-22 # Python (ibm-wml), no R - +# SWIFT=$MED106/sw/login/gcc-7.4.0/swift-t/2019-10-22 # Python (ibm-wml), no R +SWIFT=$MED106/wozniak/sw/login/gcc-6.4.0/swift-t/2020-10-22 # (opence010env) +# MPICH=$MED106/sw/login/gcc-7.4.0/mpich-3.2.1/bin +MPICH=$MED106/sw/login/gcc-6.4.0/mpich-3.2.1 PATH=$SWIFT/stc/bin:$PATH -PATH=$MED106/sw/login/gcc-7.4.0/mpich-3.2.1/bin:$PATH +PATH=$MPICH/bin:$PATH # log_path PATH @@ -45,7 +50,7 @@ LOCAL=0 CRAY=1 # Resident task workers and ranks -if [ -z ${TURBINE_RESIDENT_WORK_WORKERS+x} ] +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] then # Resident task workers and ranks export TURBINE_RESIDENT_WORK_WORKERS=1 diff --git a/workflows/common/sh/env-summit-tf-2.4.1.sh b/workflows/common/sh/env-summit-tf-2.4.1.sh new file mode 100644 index 
00000000..c93df7fa --- /dev/null +++ b/workflows/common/sh/env-summit-tf-2.4.1.sh @@ -0,0 +1,50 @@ + +# ENV Summit - TF 2.4.1 +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +module load gcc/7.4.0 +module list +set -eu + +# Base project directory +MED106=/gpfs/alpine/world-shared/med106 + +# Swift/T location +SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2021-07-28 +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# R settings +R=$MED106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R + +# Python settings +PY=$MED106/sw/open-ce-1.1.3-py37 +LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY +PATH=$PY/bin:$PATH + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/env-summit-tf1.sh b/workflows/common/sh/env-summit-tf1.sh new file mode 100644 index 00000000..39468958 --- /dev/null +++ b/workflows/common/sh/env-summit-tf1.sh @@ -0,0 +1,75 @@ + +# DEPRECATED 2021-10-01: Use env-summit +# ENV Summit TF1 +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) 
+# GCC 6.4.0, TensorFlow 1, condaenv-200408, R 3.6.1 + +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.3.1.2-20200121 +module unload darshan-runtime +# module load ibm-wml-ce/1.6.2-3 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R +# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R +# Python (ibm-wml-ce/1.7.0-1) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c +# Python (ibm-wml-ce/1.6.2-3) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 +# Python (med106/sw/condaenv-200408) and R: +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-08-19 +SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-09-02 + +export TURBINE_HOME=$SWIFT/turbine +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +# log_path PATH + +# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 + +# export LD_LIBRARY_PATH +# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH + +# Inject Python to PATH using PRELAUNCH: +# This would be better, but is broken for ZSH users: +# module load ibm-wml-ce/1.6.2-3 +# Must use PATH directly: +# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +LD_LIBRARY_PATH+=:$PY/lib +export PYTHONHOME=$PY + +export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident 
task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/env-summit-tf2.sh b/workflows/common/sh/env-summit-tf2.sh new file mode 100644 index 00000000..f469053d --- /dev/null +++ b/workflows/common/sh/env-summit-tf2.sh @@ -0,0 +1,48 @@ + +# DEPRECATED 2021-10-01: Use env-summit +# ENV Summit TF2 +# Environment settings for Summit (Swift, Python, R, Tcl, etc.) +# GCC 8.3.1, TensorFlow 2.4.1, opence 1.2.0-py38-0, R 3.6.1 + +CANDLE_MODEL_IMPL=py + +# Let modules initialize LD_LIBRARY_PATH before changing it: +set +eu # modules create errors outside our control +module load spectrum-mpi/10.4.0.3-20210112 +module unload darshan-runtime +module load open-ce/1.2.0-py38-0 +module list +set -eu + +# From Wozniak +MED106=/gpfs/alpine/world-shared/med106 +ROOT=$MED106/sw/gcc-8.3.1 +SWIFT=$ROOT/swift-t/2021-08-27 + +PATH=$SWIFT/stc/bin:$PATH +PATH=$SWIFT/turbine/bin:$PATH + +R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +LD_LIBRARY_PATH+=:$R/lib + +PYTHON=$( which python3 ) +export PYTHONHOME=$( dirname $( dirname $PYTHON ) ) + +# EMEWS Queues for R +EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R +EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py + +# For test output processing: +LOCAL=0 +CRAY=1 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/common/sh/env-summit.sh 
b/workflows/common/sh/env-summit.sh index 59814139..bb9ee199 100644 --- a/workflows/common/sh/env-summit.sh +++ b/workflows/common/sh/env-summit.sh @@ -1,71 +1,45 @@ # ENV Summit -# Environment settings for Summit (Swift, Python, R, Tcl, etc.) -# SWIFT_IMPL=echo -SWIFT_IMPL=py +# CANDLE_MODEL_IMPL=echo +CANDLE_MODEL_IMPL=py # Let modules initialize LD_LIBRARY_PATH before changing it: set +eu # modules create errors outside our control -module load spectrum-mpi/10.3.1.2-20200121 +module load spectrum-mpi module unload darshan-runtime -# module load ibm-wml-ce/1.6.2-3 module list set -eu # From Wozniak MED106=/gpfs/alpine/world-shared/med106 -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-10-18 # Python (ibm-wml), no R -# SWIFT=$MED106/sw/gcc-7.4.0/swift-t/2019-11-06 # Python (ibm-wml) and R -# Python (ibm-wml-ce/1.7.0-1) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-03-31-c -# Python (ibm-wml-ce/1.6.2-3) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-02 -# Python (med106/sw/condaenv-200408) and R: -# SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-08 -SWIFT=$MED106/wozniak/sw/gcc-6.4.0/swift-t/2020-04-11 +ROOT=$MED106/sw/summit/gcc-7.5.0 +# SWIFT=$ROOT/swift-t/2022-07-25 # Works +SWIFT=$ROOT/swift-t/m39-2022-09-27 export TURBINE_HOME=$SWIFT/turbine PATH=$SWIFT/stc/bin:$PATH PATH=$SWIFT/turbine/bin:$PATH -# log_path PATH - -# IBM_WML_CE=/autofs/nccs-svm1_sw/summit/ibm-wml-ce/anaconda-base/envs/ibm-wml-ce-1.6.2-3 - -# export LD_LIBRARY_PATH -# LD_LIBRARY_PATH=$IBM_WML_CE/lib:$LD_LIBRARY_PATH - -# Inject Python to PATH using PRELAUNCH: -# This would be better, but is broken for ZSH users: -# module load ibm-wml-ce/1.6.2-3 -# Must use PATH directly: -# export TURBINE_PRELAUNCH="PATH=$IBM_WML_CE/bin:\$PATH" - -R=/gpfs/alpine/world-shared/med106/wozniak/sw/gcc-6.4.0/R-3.6.1/lib64/R +R=$ROOT/R/4.1.3/lib64/R LD_LIBRARY_PATH+=:$R/lib -PY=/gpfs/alpine/world-shared/med106/sw/condaenv-200408 +# 
PY=/gpfs/alpine/world-shared/med106/sw/conda/2021-10-06/envs/CANDLE-2021-10-06 +# PY=/sw/summit/open-ce/anaconda-base/envs/open-ce-1.5.2-py39-0 +# PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37/ +PY=/gpfs/alpine/world-shared/med106/sw/conda/m-39-2022-09-15 LD_LIBRARY_PATH+=:$PY/lib export PYTHONHOME=$PY +PATH=$PY/bin:$PATH -export LD_LIBRARY_PATH=/gpfs/alpine/world-shared/med106/sw/condaenv-200408/lib:$LD_LIBRARY_PATH +# /gpfs/alpine/world-shared/med106/sw/condaenv-200408 +export LD_LIBRARY_PATH=$PY/lib:$LD_LIBRARY_PATH # EMEWS Queues for R -EQR=$MED106/wozniak/sw/gcc-6.4.0/EQ-R -EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py +EQR=$ROOT/EQ-R + +# EQPy=$WORKFLOWS_ROOT/common/ext/EQ-Py # For test output processing: LOCAL=0 CRAY=1 - -# Resident task worker count and rank list -# If this is already set, we respect the user settings -# If this is unset, we set it to 1 -# and run the algorithm on the 2nd highest rank -# This value is only read in HPO workflows -if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] -then - export TURBINE_RESIDENT_WORK_WORKERS=1 - export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) -fi diff --git a/workflows/common/sh/env-theta.sh b/workflows/common/sh/env-theta.sh index c2362f1d..e3120a4d 100644 --- a/workflows/common/sh/env-theta.sh +++ b/workflows/common/sh/env-theta.sh @@ -1,6 +1,7 @@ -# LANGS Theta -# Language settings for Theta (Swift, Python, R, Tcl, etc.) +# ENV Theta + +# Environment settings for Theta (Swift, Python, R, Tcl, etc.) 
# TCL=/home/wozniak/Public/sfw/theta/tcl-8.6.1 # export R=/home/wozniak/Public/sfw/theta/R-3.4.0/lib64/R @@ -57,7 +58,7 @@ fi # Selects the *.swift files to include # If "app", use app functions # If "py", use in-memory Python functions -SWIFT_IMPL="app" +CANDLE_MODEL_IMPL="app" # Log settings to output echo "Programs:" diff --git a/workflows/common/sh/env-titan.sh b/workflows/common/sh/env-titan.sh index 2ac5f1b0..caa1047a 100644 --- a/workflows/common/sh/env-titan.sh +++ b/workflows/common/sh/env-titan.sh @@ -1,4 +1,4 @@ -SWIFT_IMPL=app +CANDLE_MODEL_IMPL=app export R=/ccs/proj/med106/gounley1/titan/R-3.2.1/lib64/R export PY=/sw/xk6/deeplearning/1.0/sles11.3_gnu4.9.3 export LD_LIBRARY_PATH=$PY/lib:$R/lib:$LD_LIBRARY_PATH diff --git a/workflows/common/sh/env-washington.sh b/workflows/common/sh/env-washington.sh index 56d87ace..02dcc2e9 100644 --- a/workflows/common/sh/env-washington.sh +++ b/workflows/common/sh/env-washington.sh @@ -19,8 +19,8 @@ PATH=$R/bin:$PATH # Swift/T export PATH=/homes/wozniak/Public/sfw/swift-t/2019-05-23/stc/bin:$PATH -# SWIFT_IMPL="app" # use this one for real runs -SWIFT_IMPL="echo" # use this one to debug the model.sh command line +# CANDLE_MODEL_IMPL="app" # use this one for real runs +CANDLE_MODEL_IMPL="echo" # use this one to debug the model.sh command line # EMEWS Queues for R # EQR=/opt/EQ-R diff --git a/workflows/common/sh/get-last-experiment.zsh b/workflows/common/sh/get-last-experiment.zsh new file mode 100644 index 00000000..6b3a3036 --- /dev/null +++ b/workflows/common/sh/get-last-experiment.zsh @@ -0,0 +1,19 @@ + +# GET LAST EXPERIMENT +# A couple handy interactive functions + +D() +# Find the latest experiment directory, assign to environment variable D +{ + D=( experiments/*(om[1]) ) ; d D + local _D + _D=$D + unset D + export D=$_D +} + +E() +# Inspect the outputs in $D +{ + e $D/output.txt $D/out/out-*.txt +} diff --git a/workflows/common/sh/langs-app-biowulf.sh b/workflows/common/sh/langs-app-biowulf.sh index 
869b60af..c844f1c6 100644 --- a/workflows/common/sh/langs-app-biowulf.sh +++ b/workflows/common/sh/langs-app-biowulf.sh @@ -1,9 +1,9 @@ -# LANGS APP Singularity on Biowulf -# Language settings for singularity app functions (Python, R, etc.) +# LANGS APP Biowulf +# Language settings for app functions (Python, R, etc.) # Load the environment in which CANDLE was built -module load $DEFAULT_PYTHON_MODULE +module load "$CANDLE_DEFAULT_PYTHON_MODULE" #module load openmpi/3.1.2/cuda-9.0/gcc-7.3.0-pmi2 cuDNN/7.1/CUDA-9.0 CUDA/9.0 #source /data/$USER/conda/etc/profile.d/conda.sh diff --git a/workflows/common/sh/langs-app-frontier.sh b/workflows/common/sh/langs-app-frontier.sh new file mode 100644 index 00000000..f6e0d867 --- /dev/null +++ b/workflows/common/sh/langs-app-frontier.sh @@ -0,0 +1,13 @@ + +# LANGS APP FRONTIER SH + +# Allow for user PYTHONPATH additions: +APP_PYTHONPATH=${APP_PYTHONPATH:-} + +# Overwrite anything else set by the system or Swift/T environment: +# export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_frontier +export PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +export LD_LIBRARY_PATH=$PY/lib +export PYTHONHOME=$PY +export PATH=$PYTHONHOME/bin:$PATH +export PYTHONPATH=$PYTHONHOME/lib/python3.9:$PYTHONHOME/lib/python3.9/site-packages:$APP_PYTHONPATH diff --git a/workflows/common/sh/langs-app-gce.sh b/workflows/common/sh/langs-app-gce.sh new file mode 100644 index 00000000..7bcada0a --- /dev/null +++ b/workflows/common/sh/langs-app-gce.sh @@ -0,0 +1,9 @@ + +# LANGS APP GCE + +PATH=/nfs/gce/globalscratch/jain/conda_installs/bin:$PATH + +echo "langs-app-gce: using python:" +which python + +export PYTHONPATH=${APP_PYTHONPATH:-} diff --git a/workflows/common/sh/langs-app-lambda.sh b/workflows/common/sh/langs-app-lambda.sh new file mode 100644 index 00000000..51bb3390 --- /dev/null +++ b/workflows/common/sh/langs-app-lambda.sh @@ -0,0 +1,12 @@ + +# LANGS APP LAMBDA + +echo "langs-app-lambda ..." 
+ +SFW=/home/woz/Public/sfw + +PY=$SFW/Anaconda + +PATH=$PY/bin:$PATH + +echo "langs-app-lambda done." diff --git a/workflows/common/sh/langs-app-local.sh b/workflows/common/sh/langs-app-local.sh index c98eaaf3..855a25b7 100644 --- a/workflows/common/sh/langs-app-local.sh +++ b/workflows/common/sh/langs-app-local.sh @@ -1,16 +1,14 @@ # LANGS APP LOCAL -PYTHONHOME=${PYTHONHOME:-"/usr/"} -export PYTHONHOME -PYTHON=${PYTHON:-python} -export LD_LIBRARY_PATH="$PYTHONHOME/lib":$LD_LIBRARY_PATH +export PYTHONHOME="$HOME/anaconda3" +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" export PATH="$PYTHONHOME/bin:$PATH" COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python -PYTHONPATH+=":$PYTHONHOME/lib/$PYTHON:" +PYTHONPATH+=":$PYTHONHOME/lib/:" PYTHONPATH+=":$COMMON_DIR:" -PYTHONPATH+="$PYTHONHOME/lib/$PYTHON/dist-packages" APP_PYTHONPATH=${APP_PYTHONPATH:-} PYTHONPATH+=":$APP_PYTHONPATH" diff --git a/workflows/common/sh/langs-app-mbook.sh b/workflows/common/sh/langs-app-mbook.sh new file mode 100644 index 00000000..98bbae08 --- /dev/null +++ b/workflows/common/sh/langs-app-mbook.sh @@ -0,0 +1,25 @@ + +# LANGS APP mbook + +echo "langs-app-mbook ..." + +PY=/opt/homebrew/anaconda3/envs/tensorflow/ + +PATH=$PY/bin:$PATH + + +export PYTHONHOME=$PY +PYTHON="$PYTHONHOME/bin/" +export LD_LIBRARY_PATH="$PYTHONHOME/lib" +# export PATH="$PYTHONHOME/bin:$PATH" + +COMMON_DIR=$EMEWS_PROJECT_ROOT/../common/python +PYTHONPATH+=":$PYTHONHOME/lib/:" +PYTHONPATH+=":$COMMON_DIR:" + +APP_PYTHONPATH=${APP_PYTHONPATH:-} +PYTHONPATH+=":$APP_PYTHONPATH" + +export PYTHONPATH + +echo "langs-app-mbook done." 
diff --git a/workflows/common/sh/langs-app-polaris.sh b/workflows/common/sh/langs-app-polaris.sh new file mode 100644 index 00000000..f83741b4 --- /dev/null +++ b/workflows/common/sh/langs-app-polaris.sh @@ -0,0 +1,11 @@ + +# LANGS APP Polaris + +PATH=/grand/CSC249ADOA01/public/sfw/polaris/Miniconda/bin:$PATH + +module load singularity + +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 diff --git a/workflows/common/sh/langs-app-summit.sh b/workflows/common/sh/langs-app-summit.sh index 0f056ea8..7401008a 100644 --- a/workflows/common/sh/langs-app-summit.sh +++ b/workflows/common/sh/langs-app-summit.sh @@ -1,17 +1,12 @@ -# LANGS APP SUMMIT SH - -# WIP 2019-02-28 -APP_PYTHONPATH=${APP_PYTHONPATH:-$PYTHONPATH} +# LANGS APP SUMMIT SH -# Clear anything set by the system or Swift/T environment -unset PYTHONPATH -unset LD_LIBRARY_PATH +# Allow for user PYTHONPATH additions: +APP_PYTHONPATH=${APP_PYTHONPATH:-} -# ROOT=/ccs/proj/med106/gounley1/summit -ROOT=/ccs/proj/med106/hsyoo/summit -export PY=$ROOT/conda36 -export LD_LIBRARY_PATH=/sw/summit/cuda/10.1.168/lib64:/sw/summit/gcc/4.8.5/lib64:$PY/lib -export PYTHONHOME=$ROOT/conda36 +# Overwrite anything else set by the system or Swift/T environment: +export PY=/gpfs/alpine/world-shared/med106/sw/open-ce-1.1.3-py37 +export LD_LIBRARY_PATH=$PY/lib +export PYTHONHOME=$PY export PATH=$PYTHONHOME/bin:$PATH -export PYTHONPATH=$PYTHONHOME/lib/python3.6:$PYTHONHOME/lib/python3.6/site-packages:$APP_PYTHONPATH +export PYTHONPATH=$PYTHONHOME/lib/python3.9:$PYTHONHOME/lib/python3.9/site-packages:$APP_PYTHONPATH diff --git a/workflows/common/sh/model.sh b/workflows/common/sh/model.sh index cd82bba6..26744761 100644 --- a/workflows/common/sh/model.sh +++ b/workflows/common/sh/model.sh @@ -3,50 +3,92 @@ set -eu # MODEL.SH -# Shell wrapper around Keras model +# Supervisor shell wrapper 
around CANDLE model +# Used for CANDLE_MODEL_IMPL types: "app" and "container" + +# Note that APP_PYTHONPATH is used by models here and +# not just PYTHONPATH + +# Note: Under Swift/T, the initial output from here will go +# to the main Swift/T stdout and be mixed with output from +# other models. +# Thus, we redirect to a separate model.log file for each model run +# and normally we do not produce output until after the redirection. usage() { - echo "Usage: model.sh FRAMEWORK PARAMS RUNID" + echo "Usage: model.sh FRAMEWORK PARAMS EXPID RUNID MODEL_TYPE MODEL_NAME MODEL_ACTION" + echo "MODEL_TYPE is BENCHMARK or SINGULARITY" + echo "MODEL_NAME is the CANDLE Benchmark name (e.g., 'uno')" + echo " or a /path/to/image.sif" + echo "MODEL_ACTION is unused for a Benchmark," + echo " for Singularity it is a script (e.g., 'ACTION.sh')" echo "The environment should have:" - echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" - echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" - echo " and MODEL_NAME EXPID for model_runner.py" - echo "If SH_TIMEOUT is provided, we run under the shell command timeout" + echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" + echo " SITE MODEL_RETURN BENCHMARK_TIMEOUT" + echo " CANDLE_DATA_DIR" + echo "If SH_TIMEOUT is set, we run under the shell command timeout" } -if (( ${#} != 3 )) +if (( ${#} != 7 )) then + echo + echo "model.sh: Wrong number of arguments: received ${#} , required: 7" + echo usage exit 1 fi -FRAMEWORK=$1 # Usually "keras" -# JSON string of parameters +FRAMEWORK=$1 # Usually "keras" or "pytorch" +# JSON string of parameters: PARAMS="$2" -RUNID=$3 +export EXPID=$3 +export RUNID=$4 +export MODEL_TYPE=$5 +export MODEL_NAME=$6 +export MODEL_ACTION=$7 # Each model run, runs in its own "instance" directory # Set instance_directory to that and cd into it. 
-INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID - -SH_TIMEOUT=${SH_TIMEOUT:-} -TIMEOUT_CMD="" -if [[ -n "$SH_TIMEOUT" ]] && [[ $SH_TIMEOUT != "-1" ]] +# # TODO: rename INSTANCE_DIRECTORY to OUTPUT_DIR +#set -x +if [[ $MODEL_TYPE = "SINGULARITY" ]] then - TIMEOUT_CMD="timeout $SH_TIMEOUT" + # TODO: Rename "instance" to "run" + MODEL_TOKEN=$( basename $MODEL_NAME .sif ) + INSTANCE_DIRECTORY=$CANDLE_DATA_DIR/$MODEL_TOKEN/Output/$EXPID/$RUNID + INTERNAL_DIRECTORY=$MODEL_NAME/Output/$EXPID/$RUNID +else # "BENCHMARKS" + INSTANCE_DIRECTORY=$TURBINE_OUTPUT/$RUNID + export CANDLE_OUTPUT_DIR=$INSTANCE_DIRECTORY fi # All stdout/stderr after this point goes into model.log ! -mkdir -p $INSTANCE_DIRECTORY +mkdir -pv $INSTANCE_DIRECTORY LOG_FILE=$INSTANCE_DIRECTORY/model.log +echo "redirecting to: LOG_FILE=$INSTANCE_DIRECTORY/model.log" +set +x exec >> $LOG_FILE exec 2>&1 cd $INSTANCE_DIRECTORY -echo "MODEL.SH START:" -echo "MODEL_NAME: $MODEL_NAME" -echo "RUNID: $RUNID" +TIMEOUT_CMD="" +if [[ ${SH_TIMEOUT:-} != "" ]] && [[ $SH_TIMEOUT != "-1" ]] +then + TIMEOUT_CMD="timeout $SH_TIMEOUT" +fi + +log() +{ + echo $( date "+%Y-%m-%d %H:%M:%S" ) "MODEL.SH:" $* +} + +log "START" +log "MODEL_NAME: $MODEL_NAME" +log "RUNID: $RUNID" +log "HOST: $( hostname )" +log "ADLB_RANK_OFFSET: $ADLB_RANK_OFFSET" +log "MODEL_TYPE: $MODEL_TYPE" # Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) if [[ ${WORKFLOWS_ROOT:-} == "" ]] @@ -57,50 +99,107 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh source_site langs-app $SITE echo -echo PARAMS: +log "PARAMS:" echo $PARAMS | print_json echo -echo "MODEL.SH: USING PYTHON:" -which python +log "USING PYTHON:" $( which python3 ) +echo + +# Cf. 
utils.sh +log_path APP_PYTHONPATH +log_path PYTHONPATH +log_path LD_LIBRARY_PATH +show PYTHONHOME + +# Set up PYTHONPATH for app tasks +export PYTHONPATH=${APP_PYTHONPATH:-}:${PYTHONPATH:-} + +# Construct the desired model command MODEL_CMD based on MODEL_TYPE: +if [[ ${MODEL_TYPE:-} == "SINGULARITY" ]] +then + + # No model_runner, need to write parameters.txt explicitly: + # get hyper_parameter_map to pass as 2nd argument + + FLAGS=$( python3 $WORKFLOWS_ROOT/common/python/runner_utils.py expand_params \ + "$PARAMS" ) + + # Remove --candle image flag and the second argument, assume it is the last argument + export FLAGS="${FLAGS/ --candle_image*/}" + + # The Singularity command line arguments: + MODEL_CMD=( singularity exec --nv + --bind $CANDLE_DATA_DIR:/candle_data_dir + $MODEL_NAME ${MODEL_ACTION}.sh $ADLB_RANK_OFFSET + /candle_data_dir + $FLAGS # $INTERNAL_DIRECTORY/parameters.txt + --experiment_id $EXPID + --run_id $RUNID + ) + +else # "BENCHMARKS" + + # The Python command line arguments: + PY_CMD=( "$WORKFLOWS_ROOT/common/python/model_runner.py" + "$PARAMS" + "$INSTANCE_DIRECTORY" + "$FRAMEWORK" + "$RUNID" + "$BENCHMARK_TIMEOUT" ) + + MODEL_CMD=( python3 -u "${PY_CMD[@]}" ) + # model_runner/runner_utils writes result.txt +fi + +echo +log "MODEL_CMD: ${MODEL_CMD[@]}" echo -arg_array=( "$WORKFLOWS_ROOT/common/python/model_runner.py" - "$PARAMS" - "$INSTANCE_DIRECTORY" - "$FRAMEWORK" - "$RUNID" - "$BENCHMARK_TIMEOUT") -MODEL_CMD="python3 -u ${arg_array[@]}" -# echo MODEL_CMD: $MODEL_CMD -if $TIMEOUT_CMD python3 -u "${arg_array[@]}" +# Run Python! +$TIMEOUT_CMD "${MODEL_CMD[@]}" & +PID=$! + +# Use if block to suppress errors: +if wait $PID then - : # Assume success so we can keep a failed exit code + CODE=0 else - # $? is the exit status of the most recently executed command - # (i.e the line in the 'if' condition) CODE=$? 
+fi + +log "$MODEL_TYPE: EXIT CODE: $CODE" +if (( CODE == 0 )) +then + ls -ltrh + sleep 1 # Wait for initial output + # Get last results of the format "IMPROVE RESULT xxx" in model.log + # NOTE: Enabling set -x will break the following (token CANDLE_RESULT) + RES=$( awk -v FS="IMPROVE_RESULT" 'NF>1 {x=$2} END {print x}' \ + $INSTANCE_DIRECTORY/model.log ) + RESULT="$(echo $RES | grep -Eo '[+-]?[0-9]+([.][0-9]+)?')" || true + echo "IMPROVE RESULT: '$RESULT'" + echo $RESULT > $INSTANCE_DIRECTORY/result.txt +else echo # spacer - if [ $CODE == 124 ] + if (( $CODE == 124 )) then - echo "MODEL.SH: Timeout error in $MODEL_CMD" - # This will trigger a NaN (the result file does not exist) - exit 0 + log "TIMEOUT ERROR! (timeout=$SH_TIMEOUT)" else - echo "MODEL.SH: Error (CODE=$CODE) in $MODEL_CMD" - echo "MODEL.SH: TIMESTAMP:" $( date "+%Y-%m-%d %H:%M:%S" ) - if (( ${IGNORE_ERRORS:-0} )) - then - echo "MODEL.SH: IGNORING ERROR." - # This will trigger a NaN (the result file does not exist) - exit 0 - fi - echo "MODEL.SH: ABORTING WORKFLOW (exit 1)" - exit 1 # Unknown error in Python: abort the workflow + log "MODEL ERROR! (CODE=$CODE)" + fi + if (( ${IGNORE_ERRORS:-0} == 0 )) + then + # Unknown error in Python: abort the workflow + log "ABORTING WORKFLOW (exit 1)" + exit 1 fi + # This will trigger a NaN (the result file does not exist) + log "IGNORING ERROR." 
fi -echo "MODEL.SH END: SUCCESS" +log "END: SUCCESS" + exit 0 # Success # Local Variables: diff --git a/workflows/common/sh/model_abstention.sh b/workflows/common/sh/model_abstention.sh new file mode 100644 index 00000000..73b025bc --- /dev/null +++ b/workflows/common/sh/model_abstention.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -eu + +# MODEL.SH + +# Shell wrapper around Keras model + +usage() +{ + echo "Usage: model.sh FRAMEWORK PARAMS RUNID" + echo "The environment should have:" + echo " EMEWS_PROJECT_ROOT|WORKFLOWS_ROOT TURBINE_OUTPUT" + echo " SITE OBJ_RETURN BENCHMARK_TIMEOUT" + echo " and MODEL_NAME EXPID for model_runner.py" + echo "If SH_TIMEOUT is provided, we run under the shell command timeout" +} + +if (( ${#} != 3 )) +then + usage + exit 1 +fi + +FRAMEWORK=$1 # Usually "keras" +# JSON string of parameters +PARAMS="$2" +RUNID=$3 + +# Each model run, runs in its own "instance" directory +# Set instance_directory to that and cd into it. +INSTANCE_DIRECTORY=$TURBINE_OUTPUT/run/$RUNID + +SH_TIMEOUT=${SH_TIMEOUT:-} +TIMEOUT_CMD="" +if [[ -n "$SH_TIMEOUT" ]] && [[ $SH_TIMEOUT != "-1" ]] +then + TIMEOUT_CMD="timeout $SH_TIMEOUT" +fi + +# All stdout/stderr after this point goes into model.log ! +mkdir -p $INSTANCE_DIRECTORY +LOG_FILE=$INSTANCE_DIRECTORY/model.log +exec >> $LOG_FILE +exec 2>&1 +cd $INSTANCE_DIRECTORY + +echo "MODEL.SH START:" +echo "MODEL_NAME: $MODEL_NAME" +echo "RUNID: $RUNID" + +# Source langs-app-{SITE} from workflow/common/sh/ (cf. utils.sh) +if [[ ${WORKFLOWS_ROOT:-} == "" ]] +then + WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
; /bin/pwd ) +fi +source $WORKFLOWS_ROOT/common/sh/utils.sh +source_site langs-app $SITE + +echo +echo PARAMS: +echo $PARAMS | print_json + +echo +echo "MODEL.SH: USING PYTHON:" +which python +echo + +arg_array=( "$WORKFLOWS_ROOT/common/python/model_abstention_runner.py" + "$PARAMS" + "$INSTANCE_DIRECTORY" + "$FRAMEWORK" + "$RUNID" + "$BENCHMARK_TIMEOUT") +MODEL_CMD="python3 -u ${arg_array[@]}" +# echo MODEL_CMD: $MODEL_CMD +if $TIMEOUT_CMD python3 -u "${arg_array[@]}" +then + : # Assume success so we can keep a failed exit code +else + # $? is the exit status of the most recently executed command + # (i.e the line in the 'if' condition) + CODE=$? + echo # spacer + if [ $CODE == 124 ] + then + echo "MODEL.SH: Timeout error in $MODEL_CMD" + # This will trigger a NaN (the result file does not exist) + exit 0 + else + echo "MODEL.SH: Error (CODE=$CODE) in $MODEL_CMD" + echo "MODEL.SH: TIMESTAMP:" $( date "+%Y-%m-%d %H:%M:%S" ) + if (( ${IGNORE_ERRORS:-0} )) + then + echo "MODEL.SH: IGNORING ERROR." 
+ # This will trigger a NaN (the result file does not exist) + exit 0 + fi + echo "MODEL.SH: ABORTING WORKFLOW (exit 1)" + exit 1 # Unknown error in Python: abort the workflow + fi +fi + +echo "MODEL.SH END: SUCCESS" +exit 0 # Success + +# Local Variables: +# sh-basic-offset: 2 +# End: diff --git a/workflows/common/sh/run_logger.sh b/workflows/common/sh/run_logger.sh index e0dd2458..40b2d700 100644 --- a/workflows/common/sh/run_logger.sh +++ b/workflows/common/sh/run_logger.sh @@ -17,7 +17,7 @@ fi # "start" propose_points, max_iterations, ps, algorithm, exp_id, sys_env if [ $CMD == "start" ] - then + then SITE=$9 source $WORKFLOWS_ROOT/common/sh/utils.sh source_site langs-app $SITE diff --git a/workflows/common/sh/sched-crusher.sh b/workflows/common/sh/sched-crusher.sh new file mode 100644 index 00000000..02e933df --- /dev/null +++ b/workflows/common/sh/sched-crusher.sh @@ -0,0 +1,10 @@ + +# SCHED Crusher + +# Tell Swift/T to use SLURM: +MACHINE="-m slurm" +export TURBINE_LAUNCHER=srun + +# Default CANDLE account settings for Spock: +export PROJECT=${PROJECT:-MED106_crusher} +export QUEUE=${QUEUE:-batch} diff --git a/workflows/common/sh/sched-frontier.sh b/workflows/common/sh/sched-frontier.sh new file mode 100644 index 00000000..cea9d7a9 --- /dev/null +++ b/workflows/common/sh/sched-frontier.sh @@ -0,0 +1,18 @@ + +# SCHED Frontier + +# Scheduler settings for Swift/T/SLURM/Frontier + +MACHINE="-m slurm" + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch} +export PROJECT=${PROJECT:-MED106} + +# PY=/gpfs/alpine/med106/proj-shared/hm0/candle_tf_2.10 +PY=/lustre/orion/world-shared/med106/gounley1/conda543 +export TURBINE_PRELAUNCH="source activate $PY" + +export TURBINE_DIRECTIVE="#SBATCH -C nvme" + +export TURBINE_LAUNCH_OPTIONS="--gpus-per-task=1 --gpus-per-node=$PPN" diff --git a/workflows/common/sh/sched-gce.sh b/workflows/common/sh/sched-gce.sh new file mode 100644 index 00000000..9db8ac81 --- /dev/null +++ b/workflows/common/sh/sched-gce.sh @@ -0,0 
+1,4 @@ + +# SCHED GCE + +# Nothing: Unscheduled mpiexec execution diff --git a/workflows/common/sh/sched-lambda.sh b/workflows/common/sh/sched-lambda.sh new file mode 100644 index 00000000..779ed610 --- /dev/null +++ b/workflows/common/sh/sched-lambda.sh @@ -0,0 +1,4 @@ + +# SCHED LAMBDA + +# Empty- Lambda uses normal unscheduled mpiexec execution in Swift/T diff --git a/workflows/common/sh/sched-lambda7.sh b/workflows/common/sh/sched-lambda7.sh new file mode 100644 index 00000000..d3a21667 --- /dev/null +++ b/workflows/common/sh/sched-lambda7.sh @@ -0,0 +1,4 @@ + +# SCHED LAMBDA7 + +# Empty- Lambda uses normal unscheduled mpiexec execution in Swift/T diff --git a/workflows/common/sh/sched-local-as.sh b/workflows/common/sh/sched-local-as.sh index c5625211..25294442 100644 --- a/workflows/common/sh/sched-local-as.sh +++ b/workflows/common/sh/sched-local-as.sh @@ -8,4 +8,4 @@ MACHINE="" export LOCAL true # Default PROJECT for CANDLE -export PROJECT=NONE \ No newline at end of file +export PROJECT=NONE diff --git a/workflows/common/sh/sched-local.sh b/workflows/common/sh/sched-local.sh index c5625211..25294442 100644 --- a/workflows/common/sh/sched-local.sh +++ b/workflows/common/sh/sched-local.sh @@ -8,4 +8,4 @@ MACHINE="" export LOCAL true # Default PROJECT for CANDLE -export PROJECT=NONE \ No newline at end of file +export PROJECT=NONE diff --git a/workflows/common/sh/sched-mcs.sh b/workflows/common/sh/sched-mcs.sh index 45339c66..0f355c8e 100755 --- a/workflows/common/sh/sched-mcs.sh +++ b/workflows/common/sh/sched-mcs.sh @@ -1,5 +1,5 @@ -# SCHED LOCAL +# SCHED MCS # Scheduler settings for Swift/MCS MACHINE="" diff --git a/workflows/common/sh/sched-polaris.sh b/workflows/common/sh/sched-polaris.sh new file mode 100644 index 00000000..557a6015 --- /dev/null +++ b/workflows/common/sh/sched-polaris.sh @@ -0,0 +1,18 @@ + +# SCHED Polaris + +# Scheduler settings for Swift/T/PBS/Polaris + +MACHINE="-m pbs" + +# Default PROJECT for CANDLE +export 
PROJECT=${PROJECT:-CSC249ADOA01} + +export QUEUE=${QUEUE:-debug} +export WALLTIME=${WALLTIME:-00:10:00} + +# These are Polaris-specific settings - see: +# https://www.alcf.anl.gov/support/user-guides/polaris/hardware-overview/machine-overview +# http://swift-lang.github.io/swift-t/sites.html#_polaris +export TURBINE_POLARIS=1 +export TURBINE_DIRECTIVE='#PBS -l filesystems=home:grand' diff --git a/workflows/common/sh/sched-spock.sh b/workflows/common/sh/sched-spock.sh new file mode 100644 index 00000000..32e659e2 --- /dev/null +++ b/workflows/common/sh/sched-spock.sh @@ -0,0 +1,10 @@ + +# SCHED Spock + +# Tell Swift/T to use SLURM: +MACHINE="-m slurm" +export TURBINE_LAUNCHER=srun + +# Default CANDLE account settings for Spock: +export PROJECT=${PROJECT:-MED106} +export QUEUE=${QUEUE:-ecp} diff --git a/workflows/common/sh/sched-summit-i.sh b/workflows/common/sh/sched-summit-i.sh new file mode 100644 index 00000000..2ecbc13f --- /dev/null +++ b/workflows/common/sh/sched-summit-i.sh @@ -0,0 +1,7 @@ + +# SCHED Summit Interactive +# Scheduler settings for Swift/Summit + +MACHINE="" + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit-tf-2.4.1.sh b/workflows/common/sh/sched-summit-tf-2.4.1.sh new file mode 100644 index 00000000..ba482cec --- /dev/null +++ b/workflows/common/sh/sched-summit-tf-2.4.1.sh @@ -0,0 +1,19 @@ + +# SCHED Summit TF 2.4.1 +# Scheduler settings for Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit-tf1.sh b/workflows/common/sh/sched-summit-tf1.sh new file mode 100644 index 00000000..726bae2f --- /dev/null +++ b/workflows/common/sh/sched-summit-tf1.sh @@ -0,0 +1,19 @@ + +# SCHED Summit TF1 +# Scheduler settings for 
Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit-tf2.sh b/workflows/common/sh/sched-summit-tf2.sh new file mode 100644 index 00000000..48038d49 --- /dev/null +++ b/workflows/common/sh/sched-summit-tf2.sh @@ -0,0 +1,19 @@ + +# SCHED Summit TF2 +# Scheduler settings for Swift/Summit + +if (( ${INTERACTIVE:-0} )) +then + # Interactive settings + MACHINE="" + export TURBINE_LAUNCHER=jsrun +else + # Use LSF: + MACHINE="-m lsf" +fi + +# Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} +export PROJECT=${PROJECT:-MED106} + +# export TURBINE_OUTPUT_SOFTLINK=/dev/null diff --git a/workflows/common/sh/sched-summit.sh b/workflows/common/sh/sched-summit.sh index 9462913d..ee173969 100644 --- a/workflows/common/sh/sched-summit.sh +++ b/workflows/common/sh/sched-summit.sh @@ -1,10 +1,24 @@ # SCHED Summit -# Scheduler settings for Swift/Summit + +# Scheduler settings for Swift/T/LSF/Summit MACHINE="-m lsf" # Default PROJECT for CANDLE +#export QUEUE=${QUEUE:-batch-hm} export PROJECT=${PROJECT:-MED106} # export TURBINE_OUTPUT_SOFTLINK=/dev/null + +JSRUN_DEFAULT="-a1 -g6 -c7" + +if (( PPN == 1 )) +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +else + # For PPN=4 debugging: + export TURBINE_LAUNCH_OPTIONS="-g1 -c7 -a1" +fi + +export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" diff --git a/workflows/common/sh/set-pythonpath.sh b/workflows/common/sh/set-pythonpath.sh new file mode 100644 index 00000000..b2a028b8 --- /dev/null +++ b/workflows/common/sh/set-pythonpath.sh @@ -0,0 +1,56 @@ + +# SET PYTHONPATH SH +# Sets up BENCHMARKS_ROOT variable and PYTHONPATH for workflows +# For CANDLE models, BENCHMARKS_ROOT is the CANDLE Benchmarks repo 
+# EMEWS_PROJECT_ROOT should be set by the calling script +# User may set BENCHMARKS_ROOT to override defaults +# BENCHMARKS_ROOT must exist as directory, +# although it may be empty/unused +# Repo structure is Supervisor/workflows/PROJECT , +# with Benchmarks normally alongside Supervisor +# If MODEL_PYTHON_DIR is set, that is added to PYTHONPATH + +SUPERVISOR=$( cd $EMEWS_PROJECT_ROOT/../.. ; /bin/pwd ) + +# Set up Supervisor +export PYTHONPATH +PYTHONPATH+=:$SUPERVISOR/workflows/common/python +PYTHONPATH+=:$SUPERVISOR/models/OneD +PYTHONPATH+=:$SUPERVISOR/models/Random +PYTHONPATH+=:$SUPERVISOR/workflows/common/ext/EQ-Py + +# The remainder of this script sets up PYTHONPATHs +# for the CANDLE Benchmarks if they are found +if ! [[ -d $SUPERVISOR/../Benchmarks ]] +then + # The user must be running an external model or container + return +fi +BENCHMARKS_DEFAULT=$( cd $SUPERVISOR/../Benchmarks ; /bin/pwd ) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} + +if [[ ! -d $BENCHMARKS_ROOT ]] +then + echo "Could not find BENCHMARKS_ROOT: '$BENCHMARKS_ROOT'" + return 1 +fi + +# This is now in candle_lib, which should be installed/available +# in the common compute-node Python environment: 2022-12-20 +# APP_PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common +# PYTHONPATH+=:$BENCHMARK_DIRS:$BENCHMARKS_ROOT/common + +# Add known CANDLE Benchmarks to PYTHONPATH +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/P1B1 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Attn1 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/NT3 +PYTHONPATH+=:$BENCHMARKS_ROOT/Pilot1/Uno +PYTHONPATH+=:$BENCHMARKS_ROOT/examples/ADRP +PYTHONPATH+=:$BENCHMARKS_ROOT/examples/xform-smiles + +export APP_PYTHONPATH=${APP_PYTHONPATH:-$PYTHONPATH} + +if [[ ${MODEL_PYTHON_DIR:-} != "" ]] +then + PYTHONPATH+=:$MODEL_PYTHON_DIR +fi diff --git a/workflows/common/sh/utils.sh b/workflows/common/sh/utils.sh index 9b3f751a..d61a48c0 100644 --- a/workflows/common/sh/utils.sh +++ b/workflows/common/sh/utils.sh @@ -61,11 +61,19 @@ 
show() } log_path() -# Pretty print a colon-separated variable +# Pretty print a colon-separated variable, one entry per line # Provide the name of the variable (no dollar sign) { - echo ${1}: - eval echo \$$1 | tr : '\n' | nl + # First, test if $1 is the name of a set shell variable: + if eval test \$\{$1:-\} + then + echo ${1}: + eval echo \$$1 | tr : '\n' | nl + echo -- + echo + else + echo "log_path(): ${1} is unset." + fi } which_check() @@ -90,9 +98,16 @@ python_envs() RESULT=() if [[ ${PYTHONPATH:-} != "" ]] then - # We do not currently need this- + # We do not currently need this except on MCS and Spock: # Swift/T should grab PYTHONPATH automatically - : # RESULT+=( -e PYTHONPATH=$PYTHONPATH ) + if [[ ${SITE} == "mcs" ]] || \ + [[ ${SITE} == "spock" ]] || \ + [[ ${SITE} == "lambda" ]] || \ + [[ ${SITE} == "frontier" ]] + then + # MCS discards PYTHONPATH in subshells + RESULT+=( -e PYTHONPATH=$PYTHONPATH ) + fi fi if [[ ${PYTHONHOME:-} != "" ]] then @@ -122,6 +137,7 @@ get_site() export SITE=$1 } + check_experiment() { if [[ -d $TURBINE_OUTPUT ]]; then while true; do @@ -138,76 +154,85 @@ check_experiment() { get_expid() # Get Experiment IDentifier -# EXPID is the name of the new directory under experiments/ -# If the user provides -a, this function will autogenerate -# a new EXPID under the experiments directory, -# If EXP_SUFFIX is set in the environment, the resulting -# EXPID will have that suffix. +# EXPID: The name of the new directory under experiments/ +# If the user provides -a, this function will autogenerate +# a new EXPID under the experiments directory, +# If EXP_SUFFIX is set in the environment, the resulting +# EXPID will have that suffix. 
+# CANDLE_MODEL_TYPE: "BENCHMARKS" or "SINGULARITY" +# Defaults to "BENCHMARKS" +# This variable affects the experiment directory structure # RETURN VALUES: EXPID and TURBINE_OUTPUT are exported into the environment # TURBINE_OUTPUT is canonicalized, because it may be soft-linked # to another filesystem (e.g., on Summit), and must be accessible # from the compute nodes without accessing the soft-links { - if (( ${#} < 1 )) + if (( ${#} != 1 )) then - echo "get_expid(): could not find EXPID argument!" + echo "get_expid(): provide EXPID or '-a'" return 1 fi - EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} - export EXPID=$1 + : ${CANDLE_MODEL_TYPE:=BENCHMARKS} ${MODEL_NAME:=cmp} + echo "get_expid(): CANDLE_MODEL_TYPE=$CANDLE_MODEL_TYPE" + echo "get_expid(): MODEL_NAME=$MODEL_NAME" + + export EXPERIMENTS="" + + if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] + then + # Keep this directory in sync with model.sh RUN_DIRECTORY + MODEL_TOKEN=$( basename $MODEL_NAME .sif ) + EXPERIMENTS=$CANDLE_DATA_DIR/$MODEL_TOKEN/Output + else # "BENCHMARKS" + EXPERIMENTS=${EXPERIMENTS:-$EMEWS_PROJECT_ROOT/experiments} + fi + local i=0 EXPS E TO - if [ $EXPID = "-a" ] + if [[ $EXPID == "-a" ]] then shift # Search for free experiment number - mkdir -pv $EXPERIMENTS + if ! 
mkdir -pv $EXPERIMENTS + then + echo "get_expid(): could not make experiments directory:" \ + $EXPERIMENTS + return 1 + fi EXPS=( $( ls $EXPERIMENTS ) ) if (( ${#EXPS[@]} != 0 )) then for E in ${EXPS[@]} do - EXPID=$( printf "X%03i" $i )${EXP_SUFFIX:-} + EXPID=$( printf "EXP%03i" $i )${EXP_SUFFIX:-} if [[ $E == $EXPID ]] then i=$(( i + 1 )) fi done fi - EXPID=$( printf "X%03i" $i )${EXP_SUFFIX:-} - export TURBINE_OUTPUT=$EXPERIMENTS/$EXPID + EXPID=$( printf "EXP%03i" $i )${EXP_SUFFIX:-} + TURBINE_OUTPUT=$EXPERIMENTS/$EXPID check_experiment else - export TURBINE_OUTPUT=$EXPERIMENTS/$EXPID + TURBINE_OUTPUT=$EXPERIMENTS/$EXPID fi mkdir -pv $TURBINE_OUTPUT TO=$( readlink --canonicalize $TURBINE_OUTPUT ) if [[ $TO == "" ]] then - echo "Could not canonicalize: $TURBINE_OUTPUT" + echo "get_expid(): could not canonicalize: $TURBINE_OUTPUT" exit 1 fi - TURBINE_OUTPUT=$TO - - # Andrew: Needed for functionality with George's restart.py script for UPF jobs - if [ -f metadata.json ]; then - mv metadata.json $TURBINE_OUTPUT - fi - - # Andrew: Copy the CANDLE input file to the current experiments directory for reference - if [ -n "${CANDLE_INPUT_FILE-}" ]; then - if [ -f "$CANDLE_INPUT_FILE" ]; then - cp "$CANDLE_INPUT_FILE" "$TURBINE_OUTPUT" - fi - fi - + export TURBINE_OUTPUT=$TO } next() # Obtain next available numbered file name matching pattern +# in global variable REPLY # E.g., 'next out-%02i' returns 'out-02' if out-00 and out-01 exist. 
{ local PATTERN=$1 FILE="" i=0 @@ -348,22 +373,24 @@ queue_wait_site() SITE=$1 JOBID=$2 - if [[ $SITE == "cori" ]] + site2=$(echo $SITE | awk -v FS="-" '{print $1}') # ALW 2020-11-15: allow $SITEs to have hyphens in them as Justin implemented for Summit on 2020-10-29, e.g., summit-tf1 + + if [[ $site2 == "cori" ]] then queue_wait_slurm $JOBID - elif [[ $SITE == "theta" ]] + elif [[ $site2 == "theta" ]] then queue_wait_cobalt $JOBID - elif [[ $SITE == "titan" ]] - then - queue_wait_pbs $JOBID - elif [[ $SITE == "summit" ]] + elif [[ $site2 =~ summit* ]] then queue_wait_lsf $JOBID - elif [[ $SITE == "pascal" ]] + elif [[ $site2 == "spock" ]] then queue_wait_slurm $JOBID - elif [[ $SITE == "biowulf" ]] + elif [[ $site2 == "pascal" ]] + then + queue_wait_slurm $JOBID + elif [[ $site2 == "biowulf" ]] then queue_wait_slurm $JOBID else @@ -564,6 +591,13 @@ log_script() { echo "" >> $LOG_NAME echo "## SCRIPT ###" >> $LOG_NAME cat $EMEWS_PROJECT_ROOT/swift/$SCRIPT_NAME >> $LOG_NAME + + # Andrew: Copy the CANDLE input file to the current experiments directory for reference + if [ -n "${CANDLE_INPUT_FILE-}" ]; then + if [ -f "$CANDLE_INPUT_FILE" ]; then + cp "$CANDLE_INPUT_FILE" "$TURBINE_OUTPUT" + fi + fi } check_directory_exists() { @@ -585,7 +619,7 @@ pad_keys() { # Pad 1st tokens printf "%-15s " $1 shift - echo ${*} + echo $* } print_json() { diff --git a/workflows/common/swift/candle_utils.swift b/workflows/common/swift/candle_utils.swift index afaf494f..de021110 100644 --- a/workflows/common/swift/candle_utils.swift +++ b/workflows/common/swift/candle_utils.swift @@ -9,7 +9,7 @@ puts "" puts "report_env() ..." 
puts "" global env -# puts [ array names env ] +# puts [ array names env ] puts "TURBINE_HOME: $env(TURBINE_HOME)" puts "" set tokens [ split $env(PATH) ":" ] @@ -17,10 +17,13 @@ foreach token $tokens { puts "PATH: $token" } puts "" -set tokens [ split $env(LD_LIBRARY_PATH) ":" ] -foreach token $tokens { - puts "LLP: $token" +if [ info exists env(LD_LIBRARY_PATH) ] { + set tokens [ split $env(LD_LIBRARY_PATH) ":" ] + foreach token $tokens { + puts "LLP: $token" + } } +puts "" if [ info exists env(PYTHONHOME) ] { puts "" puts "PYTHONHOME: $env(PYTHONHOME)" diff --git a/workflows/common/swift/model_app.swift b/workflows/common/swift/model_app.swift new file mode 100644 index 00000000..896ec626 --- /dev/null +++ b/workflows/common/swift/model_app.swift @@ -0,0 +1,64 @@ + +/** + CANDLE MODEL: APP + Runs CANDLE models as Swift/T app functions +*/ + +/** + The main objective function used by the CANDLE/Supervisor + model exploration (optimization) loop. + params : The JSON string of params to be passed to the Benchmark + expid : A string experiment ID that will be in the output directory name + runid : A string run ID that will be in the output directory name + model_name : Benchmark (e.g., "uno") +*/ +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ + + string model_sh = getenv("MODEL_SH"); + string turbine_output = getenv("TURBINE_OUTPUT"); + + string outdir; + + outdir = "%s/%s" % (turbine_output, runid); + // outdir = "%s/%s/Output/%s/%s" % (turbine_output, model_name, expid, runid); + + printf("candle_model_train_app(): running model shell in: %s", + outdir); + + // We do not use a file type here because this file may not be created, + // which is handled by get_results() + string result_file = outdir/"result.txt"; + wait (run_model(model_sh, params, expid, runid)) + { + model_result = get_results(result_file); + } + printf("candle_model_train_app: result(%s): '%s'", runid, model_result); +} + +/** + 
Swift/T app function that runs the Benchmark +*/ +app (void o) run_model (string model_sh, string params, + string expid, string runid) +{ + // 1 2 3 4 5 6 7 + "bash" model_sh FRAMEWORK params expid runid "BENCHMARK" model_name "train"; +} + +/** + Extracts the Benchmark output if it exists, + else, provides a NaN so the workflow can keep running +*/ +(string model_result) get_results(string result_file) { + if (file_exists(result_file)) { + file line = input(result_file); + model_result = trim(read(line)); + } else { + printf("File not found: %s", result_file); + model_result = "NaN"; + } +} diff --git a/workflows/common/swift/model_container.swift b/workflows/common/swift/model_container.swift new file mode 100644 index 00000000..57f41103 --- /dev/null +++ b/workflows/common/swift/model_container.swift @@ -0,0 +1,61 @@ + +/** + CANDLE MODEL: CONTAINER + Runs CANDLE models as Swift/T app functions + under a Singularity container +*/ + +/** + The main objective function used by the CANDLE/Supervisor + model exploration (optimization) loop. 
+ params : The JSON string of params to be passed to the Benchmark + run_id : A string run ID that will be the output directory name + model_name : A path to a SIF +*/ +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ + CDD = getenv("CANDLE_DATA_DIR"); + model_sh = getenv("MODEL_SH"); + + model_token = rootname_string(basename_string(model_name)); + outdir = "%s/%s/Output/%s/%s" % (CDD, model_token, expid, runid); + printf("candle_model_train_container(): running in: %s", outdir); + + // We do not use a file type here because this file may not be created, + // which is handled by get_results() + result_file = outdir/"result.txt"; + wait (run_model_train(model_sh, params, expid, runid, model_name)) + { + model_result = get_results(result_file); + } + printf("candle_model_train_container(): result(%s): '%s'", + runid, model_result); +} + +/** + Swift/T app function that runs the Benchmark +*/ +app (void o) run_model_train(string model_sh, string params, + string expid, string runid, + string model_name) +{ + // 1 2 3 4 5 6 7 + "bash" model_sh FRAMEWORK params expid runid "SINGULARITY" model_name "train"; +} + +/** + Extracts the model result if it exists, + else, provides a NaN so the workflow can keep running +*/ +(string model_result) get_results(string result_file) { + if (file_exists(result_file)) { + file line = input(result_file); + model_result = trim(read(line)); + } else { + printf("File not found: %s", result_file); + model_result = "NaN"; + } +} diff --git a/workflows/common/swift/obj_echo.swift b/workflows/common/swift/model_echo.swift similarity index 64% rename from workflows/common/swift/obj_echo.swift rename to workflows/common/swift/model_echo.swift index d72c492e..7a8bb50e 100644 --- a/workflows/common/swift/obj_echo.swift +++ b/workflows/common/swift/model_echo.swift @@ -1,5 +1,8 @@ -// OBJ ECHO +/** + CANDLE MODEL: CONTAINER + Pretends to run CANDLE models, actually just echoes its 
arguments +*/ /** This has the same signature as the main objective function @@ -8,8 +11,11 @@ params : The JSON string of params to be passed to the Benchmark run_id : A string run ID that will be the output directory name */ -(string obj_result) obj(string params_in, - string run_id) { +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ string model_sh = getenv("MODEL_SH"); string turbine_output = getenv("TURBINE_OUTPUT"); @@ -18,5 +24,5 @@ // 1 2 3 printf("bash model.sh %s %s %s in: %s", FRAMEWORK, params, run_id, turbine_output) => - obj_result = "ECHO SUCCESS"; + model_result = "ECHO SUCCESS"; } diff --git a/workflows/common/swift/model_py.swift b/workflows/common/swift/model_py.swift new file mode 100644 index 00000000..5d798e7b --- /dev/null +++ b/workflows/common/swift/model_py.swift @@ -0,0 +1,59 @@ + +/** + CANDLE MODEL: PY + Runs CANDLE models as Swift/T python() functions +*/ + +string code_template = +---- +try: + import json + import os + import sys + import traceback + import model_runner + + sys.argv = [ 'python' ] + import tensorflow + from tensorflow import keras + + model_result = 'NaN' + outdir = '%s' + + if not os.path.exists(outdir): + os.makedirs(outdir) + + J = """%s""" + hyper_parameter_map = json.loads(J) + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['framework'] = 'keras' + hyper_parameter_map['save'] = '{}/output'.format(outdir) + hyper_parameter_map['instance_directory'] = outdir + hyper_parameter_map['model_name'] = '%s' + hyper_parameter_map['experiment_id'] = '%s' + hyper_parameter_map['run_id'] = '%s' + hyper_parameter_map['timeout'] = %d + + model_result, history = model_runner.run_model(hyper_parameter_map) + +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('\\n\\nEXCEPTION in candle_model_train(): \\n' + + repr(e) + ' ... 
\\n' + ''.join(s)) + sys.stdout.write('\\n') + sys.stdout.flush() + model_result = 'EXCEPTION' +----; + +(string model_result) candle_model_train(string params, + string expid, + string runid, + string model_name) +{ + string outdir = "%s/run/%s" % (turbine_output, runid); + string code = code_template % (outdir, params, model_name, + expid, runid, benchmark_timeout); + model_result = python_persist(code, "str(model_result)"); + printf("model_py:candle_model_train(): model_result: '%s'", model_result); +} diff --git a/workflows/common/swift/obj_py.swift b/workflows/common/swift/obj_abstention_py.swift similarity index 87% rename from workflows/common/swift/obj_py.swift rename to workflows/common/swift/obj_abstention_py.swift index b058d924..89075f72 100644 --- a/workflows/common/swift/obj_py.swift +++ b/workflows/common/swift/obj_abstention_py.swift @@ -7,9 +7,9 @@ string code_template = ---- try: import sys, traceback, json, os - import model_runner - import tensorflow - from tensorflow import keras + import model_abstention_runner + import tensorflow + from tensorflow import keras obj_result = '-100' outdir = '%s' @@ -26,7 +26,7 @@ try: hyper_parameter_map['run_id'] = '%s' hyper_parameter_map['timeout'] = %d - obj_result, history = model_runner.run_model(hyper_parameter_map) + obj_result, history = model_abstention_runner.run_model(hyper_parameter_map) except Exception as e: info = sys.exc_info() diff --git a/workflows/common/swift/obj_app.swift b/workflows/common/swift/obj_app.swift deleted file mode 100644 index 0fcc49fe..00000000 --- a/workflows/common/swift/obj_app.swift +++ /dev/null @@ -1,73 +0,0 @@ - -// OBJ APP - -/** - The main objective function used by the CANDLE/Supervisor - model exploration (optimization) loop. 
- params : The JSON string of params to be passed to the Benchmark - run_id : A string run ID that will be the output directory name -*/ -(string obj_result) obj(string params, - string run_id) { - string model_sh = getenv("MODEL_SH"); - string turbine_output = getenv("TURBINE_OUTPUT"); - - string outdir = "%s/run/%s" % (turbine_output, run_id); - // printf("running model shell script in: %s", outdir); - // We do not use a file type here because this file may not be created, - // which is handled by get_results() - string result_file = outdir/"result.txt"; - wait (run_model(model_sh, params, run_id)) - { - obj_result = get_results(result_file); - } - printf("result(%s): %s", run_id, obj_result); -} - -/** - The main objective function used by the CANDLE/Supervisor - model exploration (optimization) loop. - params : The JSON string of params to be passed to the Benchmark - run_id : A string run ID that will be the output directory name -*/ -(string obj_result) obj_prio(string params, - string run_id, int prio) { - string model_sh = getenv("MODEL_SH"); - string turbine_output = getenv("TURBINE_OUTPUT"); - - string outdir = "%s/run/%s" % (turbine_output, run_id); - // printf("running model shell script in: %s", outdir); - // We do not use a file type here because this file may not be created, - // which is handled by get_results() - string result_file = outdir/"result.txt"; - wait (@prio=prio run_model(model_sh, params, run_id)) - { - obj_result = get_results(result_file); - } - // printf("result(%s): %s", run_id, obj_result); -} - -/** - Swift/T app function that runs the Benchmark -*/ -app (void o) run_model (string model_sh, string params, - string runid) -{ - // 1 2 3 - "bash" model_sh FRAMEWORK params runid; -} - -/** - Extracts the Benchmark output if it exists, - else, provides a NaN so the workflow can keep running -*/ -(string obj_result) get_results(string result_file) { - if (file_exists(result_file)) { - file line = input(result_file); - obj_result = 
trim(read(line)); - } else { - printf("File not found: %s", result_file, " - benchmark might have stopped without completing/returning history variable."); - // return with a large value - obj_result = "1e7"; - } -} diff --git a/workflows/cp-leaveout/.gitignore b/workflows/cp-leaveout/.gitignore index 61e34c95..f4a85539 100644 --- a/workflows/cp-leaveout/.gitignore +++ b/workflows/cp-leaveout/.gitignore @@ -1,3 +1,4 @@ experiments turbine-output +*.data *.pkl diff --git a/workflows/cp-leaveout/README-chained.md b/workflows/cp-leaveout/README-chained.md index 64d4d335..102a8e46 100644 --- a/workflows/cp-leaveout/README-chained.md +++ b/workflows/cp-leaveout/README-chained.md @@ -1,41 +1,40 @@ -# Challenge Problem: Leave Out - Job Chained Workflow # +# Challenge Problem: Leave Out - Job Chained Workflow This workflow runs the CP Leave Out workflow using job chaining. Each stage of the workflow will be submitted as a separate job where subsequent stages are only run when the previous job on which they depend has successfully completed. -For example, if the workflow configuration consists of an initial 4 Uno model runs, and a -subsequent 16 model runs where each of those model runs require the trained weights -of one of the initial 4 as input, then the first 4 will be submitted as a job, and -the second 16 as a job that will only begin running when the first has successfully +For example, if the workflow configuration consists of an initial 4 Uno model runs, and a +subsequent 16 model runs where each of those model runs require the trained weights +of one of the initial 4 as input, then the first 4 will be submitted as a job, and +the second 16 as a job that will only begin running when the first has successfully completed. 
## Requirements -* Check out Benchmarks branch loocv into a compute-node writeable directory, - e.g., /gpfs/alpine/med106/scratch/$USER - * Edit uno_baseline_keras2.py to replace uno_default_model.txt with uno_auc_model.txt - * Set `BENCHMARKS_ROOT` in your submission script (see below), -e.g., test-1.sh, to this compute node writable Benchmarks directory. -* The following data files are required: - * A plan json file (e.g., `plangen_cell1593-p4_drug1779-p1.json`) - * A dataframe file (e.g., `top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather`), a feather or parquet - file will be faster. +- Check out Benchmarks branch loocv into a compute-node writeable directory, + e.g., /gpfs/alpine/med106/scratch/$USER + - Edit uno_baseline_keras2.py to replace uno_default_model.txt with uno_auc_model.txt + - Set `BENCHMARKS_ROOT` in your submission script (see below), + e.g., test-1.sh, to this compute node writable Benchmarks directory. +- The following data files are required: + - A plan json file (e.g., `plangen_cell1593-p4_drug1779-p1.json`) + - A dataframe file (e.g., `top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather`), a feather or parquet + file will be faster. +## Running the Workflow -## Running the Workflow ## - -Sample files for configuring and running the workflow are in the `test-chained` directory. +Sample files for configuring and running the workflow are in the `test-chained` directory. The workflow itself is launched using the python script `py/run_chained.py`. Essentially, `run_chained.py` does the following: 1. Reads a configuration file specifying what data files to use, how many stages to run, -and how to configure each of those stages (e.g. PROCS, WALLTIME, etc.), + and how to configure each of those stages (e.g. PROCS, WALLTIME, etc.), 2. Generates a UPF file for each stage where each UPF file contains the node ids to run for that stage, -3. 
Runs each stage as a separate UPF-style workflow job, managing the job and parent model weight location dependencies appropriately. +3. Runs each stage as a separate UPF-style workflow job, managing the job and parent model weight location dependencies appropriately. Each individual stage job submission launched by `run_chained.py` follows the pattern of the other Supervisor workflows where -a *test* submission script is executed which in turn sources *sys* and *prm* configurations, and then -calls another script (e.g., `swift/cpl-upf-workflow.sh`) that performs further configuration and executes the swift script +a _test_ submission script is executed which in turn sources _sys_ and _prm_ configurations, and then +calls another script (e.g., `swift/cpl-upf-workflow.sh`) that performs further configuration and executes the swift script (e.g., `swift/cpl-upf-workflow.swift`). `run_chained.py` performs this individual job submission for each stage by: @@ -53,17 +52,17 @@ usage: run_chained.py [-h] --config CONFIG [--stages STAGES] [--dry_run] [--first_stage_parent_directory FIRST_STAGE_PARENT_DIRECTORY] ``` -* --config - the path of the workflow configuration file -* --stages - the number of stages to run. This will override the value specified in the configuration file -* --dry_run - executes the workflow, displaying the configuration for each stage, but does **not** submit any jobs -* --first_stage - the stage at which to start the workflow. The stage count starts with *1* and a `first_stage` of *1* corresponds to the initial parentless stage. This will override the value specified in the configuration file -* --first_stage_parent_directory - the file system location of the first stage's parent stage, when `first_stage` is greater than 1. This will override the value specified in the configuration file +- --config - the path of the workflow configuration file +- --stages - the number of stages to run. 
This will override the value specified in the configuration file +- --dry_run - executes the workflow, displaying the configuration for each stage, but does **not** submit any jobs +- --first_stage - the stage at which to start the workflow. The stage count starts with _1_ and a `first_stage` of _1_ corresponds to the initial parentless stage. This will override the value specified in the configuration file +- --first_stage_parent_directory - the file system location of the first stage's parent stage, when `first_stage` is greater than 1. This will override the value specified in the configuration file Of these only `--config` is required. The `first_stage` argument can be used to continue a previously run job chaining workflow. For example, if the previous workflow ran stages 1 and 2. Then a `first_stage` argument of 3 and -a `first_stage_parent_directory` argument that points to the experiment directory of the previously run stage 2 will continue the previous workflow starting at stage 3. +a `first_stage_parent_directory` argument that points to the experiment directory of the previously run stage 2 will continue the previous workflow starting at stage 3. `run_chained.py` should be run from within the test-chained directory. @@ -71,24 +70,23 @@ a `first_stage_parent_directory` argument that points to the experiment director The configuration file has the following json format (see `test-chained/cfg.json` for an example): -* site: the name of the hpc site (e.g. "summit") -* plan: the path to the challenge problem leave one out plan file -* submit_script: the script used for the individual stage job submission (e.g. test-chained/test-1.sh) -* upf_directory: the directory where the upf files are written out to -* stages: the number of stages to run. -1 = run all the stages -* first_stage: the stage at which to start the workflow. A value of 1 means the initial parentless stage.
-* first_stage_parent_directory: the file system location of the first stage's parent stage, when `first_stage` is greater than 1. -* stage_cfg_script: the staget configuration script (e.g. `test-chained/cfg-stage-sys.sh`) sourced by the -submit script to set the configuration (WALLTIME etc.) for each individual stage run. -Environment variables specified in the "stage_cfgs" (see below) will override those in this file. -* stage_cfgs: a list of optional stage configurations, where each configuration is a json map. By default, if no -stage configuration is defined for a particular stage or PROCS and PPN are not defined in that -stage configuration, then PROCS will be set to the number of plan nodes to run (i.e., the length of the UPF file) + 1 and PPN will be set to 1. In this way, the default is to run all the Uno model runs -concurrently. For the other environment variables in a stage configuration, the defaults in the -stage_cfg_script will be used. All the key value pairs in a stage configuration except for *stage* are preserved as environment variables when the submit_script is called and will override those (e.g., WALLTIME, etc.) in the stage_cfg_script. A stage configuration map can have the following entries. - * stage: the stage number - * X: where X is an environment variable from the stage_cfg_script, e.g. WALLTIME, PROCS, PPN, etc. - +- site: the name of the hpc site (e.g. "summit") +- plan: the path to the challenge problem leave one out plan file +- submit_script: the script used for the individual stage job submission (e.g. test-chained/test-1.sh) +- upf_directory: the directory where the upf files are written out to +- stages: the number of stages to run. -1 = run all the stages +- first_stage: the stage at which to start the workflow. A value of 1 means the initial parentless stage. +- first_stage_parent_directory: the file system location of the first stage's parent stage, when `first_stage` is greater than 1. 
+- stage_cfg_script: the stage configuration script (e.g. `test-chained/cfg-stage-sys.sh`) sourced by the + submit script to set the configuration (WALLTIME etc.) for each individual stage run. + Environment variables specified in the "stage_cfgs" (see below) will override those in this file. +- stage_cfgs: a list of optional stage configurations, where each configuration is a json map. By default, if no + stage configuration is defined for a particular stage or PROCS and PPN are not defined in that + stage configuration, then PROCS will be set to the number of plan nodes to run (i.e., the length of the UPF file) + 1 and PPN will be set to 1. In this way, the default is to run all the Uno model runs + concurrently. For the other environment variables in a stage configuration, the defaults in the + stage_cfg_script will be used. All the key value pairs in a stage configuration except for _stage_ are preserved as environment variables when the submit_script is called and will override those (e.g., WALLTIME, etc.) in the stage_cfg_script. A stage configuration map can have the following entries. + - stage: the stage number + - X: where X is an environment variable from the stage_cfg_script, e.g. WALLTIME, PROCS, PPN, etc.
### An Example Run @@ -114,9 +112,9 @@ Resovled Stage Configuration: PPN: 1 WALLTIME: 01:00:00 TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n## JOB 0 - TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 + TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 BENCHMARK_TIMEOUT: -1 - SH_TIMEOUT: + SH_TIMEOUT: IGNORE_ERRORS: 0 CPL-UPF-WORKFLOW.SH: Running model: uno for EXPID: X134 sourcing /autofs/nccs-svm1_proj/med106/ncollier/repos/Supervisor/workflows/common/sh/env-summit.sh @@ -157,9 +155,9 @@ Resovled Stage Configuration: PPN: 1 WALLTIME: 00:45:00 TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n#BSUB -w done(704496) - TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 + TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 BENCHMARK_TIMEOUT: -1 - SH_TIMEOUT: + SH_TIMEOUT: IGNORE_ERRORS: 0 CPL-UPF-WORKFLOW.SH: Running model: uno for EXPID: X135 sourcing /autofs/nccs-svm1_proj/med106/ncollier/repos/Supervisor/workflows/common/sh/env-summit.sh @@ -215,9 +213,9 @@ Resovled Stage Configuration: PPN: 1 WALLTIME: 01:00:00 TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n## JOB 0 - TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 + TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 BENCHMARK_TIMEOUT: -1 - SH_TIMEOUT: + SH_TIMEOUT: IGNORE_ERRORS: 0 @@ -228,8 +226,8 @@ Resovled Stage Configuration: PPN: 1 WALLTIME: 00:45:00 TURBINE_DIRECTIVE: \n#BSUB -alloc_flags "NVME maximizegpfs"\n#BSUB -w done() - TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 + TURBINE_LAUNCH_OPTIONS: -a1 -c42 -g1 BENCHMARK_TIMEOUT: -1 - SH_TIMEOUT: + SH_TIMEOUT: IGNORE_ERRORS: 0 -``` \ No newline at end of file +``` diff --git a/workflows/cp-leaveout/db/README.adoc b/workflows/cp-leaveout/db/README.adoc new file mode 100644 index 00000000..770c5779 --- /dev/null +++ b/workflows/cp-leaveout/db/README.adoc @@ -0,0 +1,37 @@ + +== DB Tools + +Tools for the CP SQLite DB. 
+ +=== print-db + +Dump a DB file to text output + +---- +$ ./print-db.sh workflow-1.db +---- + +=== diff-dbs + +Show difference between two DB files + +---- +$ ./diff-dbs.sh workflow-1.db workflow-2.db +---- + +=== print-stats + +Show short DB stats. + +---- +$ ./print-stats.sh workflow-1.db +COMPLETE / TOTAL = 1364 / 1364 : 0 remaining. +---- + +=== reset-node + +Reset (delete) DB nodes, forcing them to be re-run + +---- +$ db/reset-node.sh experiments/X085/restarts-1/cplo.db 1.2.3.2 +---- diff --git a/workflows/cp-leaveout/db/diff-dbs.sh b/workflows/cp-leaveout/db/diff-dbs.sh index a208750e..78026b08 100755 --- a/workflows/cp-leaveout/db/diff-dbs.sh +++ b/workflows/cp-leaveout/db/diff-dbs.sh @@ -23,4 +23,3 @@ sqlite3 $DB2 < $THIS/print-db.sql > $TXT2 diff $TXT1 $TXT2 rm $TXT1 $TXT2 - diff --git a/workflows/cp-leaveout/db/print-db.sh b/workflows/cp-leaveout/db/print-db.sh index edc00b4f..d3fa8868 100755 --- a/workflows/cp-leaveout/db/print-db.sh +++ b/workflows/cp-leaveout/db/print-db.sh @@ -13,4 +13,5 @@ DB=$1 THIS=$( readlink --canonicalize $( dirname $0 ) ) +echo DB: $DB sqlite3 $DB < $THIS/print-db.sql diff --git a/workflows/cp-leaveout/db/print-stats.sh b/workflows/cp-leaveout/db/print-stats.sh index 0d804a88..20d8edb1 100755 --- a/workflows/cp-leaveout/db/print-stats.sh +++ b/workflows/cp-leaveout/db/print-stats.sh @@ -1,4 +1,5 @@ #!/bin/sh +set -eu # PRINT STATS SH @@ -10,13 +11,27 @@ fi DB=$1 -COMPLETE=$( +if ! which sqlite3 > /dev/null +then + echo "print-stats.sh: Add sqlite3 to PATH!" 
+ exit 1 +fi + +echo DB: $DB + +COMPLETE=$( sqlite3 $DB < 5 ); EOF -# update runhist SET status="RESET" where (length(subplan_id) > 5 ); + +# UPDATE runhist SET status="RESET" WHERE (subplan_id LIKE "${NODE}%") ; +# EOF diff --git a/workflows/cp-leaveout/py/README.md b/workflows/cp-leaveout/py/README.md index b195320b..d4f6a18f 100644 --- a/workflows/cp-leaveout/py/README.md +++ b/workflows/cp-leaveout/py/README.md @@ -1,36 +1,38 @@ -# Uno: Milestone 13 Transfer Learning +# Uno: Milestone 13 Transfer Learning + This README discusses the use of the `plangen.py` script to partition feature sets for experiments with large scale transfer learning and parallel model training. The utility does the following: -* Accept text files containing lists of feature names of arbitray length, each is called a feature-set -* Generate unique combinations of features from each feature set, setting the stage for transfer learning (partitioning) -* Construct a tree depicting how successive, parallel training sessions can be scheduled upon the completion of a predecessor/parent (planning) +- Accept text files containing lists of feature names of arbitray length, each is called a feature-set +- Generate unique combinations of features from each feature set, setting the stage for transfer learning (partitioning) +- Construct a tree depicting how successive, parallel training sessions can be scheduled upon the completion of a predecessor/parent (planning) ## Overview + A number of partitioning schemes and data representation strategies have been discussed. The focus here is the configuration agreed upon at the May 2019 CANDLE hack-a-thon. Specifically: -* There are two feature sets, cell-lines and drugs. -* In a prototype implementation, each feature-set will contain eight entries. The target configuration will have 1000 cell features and 1000 drug features. -* Partitioning is accomplished by recursively splitting the cell vs drug graph into quadrants. 
-* Each such partitioning presents four training opportunities, each uniquely combines three quadrants and omits one. -* The omitted quadrant defines validation data for the training run. Partitioning/planning recurs on this quadrant to define successors. -* The four training operations can be scheduled to run in parallel once the training of their common parent completes. -* The partitioning scheme as well as the training parent/child relationships will be expressed in a JSON document. +- There are two feature sets, cell-lines and drugs. +- In a prototype implementation, each feature-set will contain eight entries. The target configuration will have 1000 cell features and 1000 drug features. +- Partitioning is accomplished by recursively splitting the cell vs drug graph into quadrants. +- Each such partitioning presents four training opportunities, each uniquely combines three quadrants and omits one. +- The omitted quadrant defines validation data for the training run. Partitioning/planning recurs on this quadrant to define successors. +- The four training operations can be scheduled to run in parallel once the training of their common parent completes. +- The partitioning scheme as well as the training parent/child relationships will be expressed in a JSON document. ## Running the script -`plangen.py` arguments are defined in `planargs.py`. `sample-command-line` is a script that demonstrates the parameters used to accomplish the objectives outlined above. Refer to that sample when reading the argument descriptions below. `--help` gives a brief summary of all arguments. +`plangen.py` arguments are defined in `planargs.py`. `sample-command-line` is a script that demonstrates the parameters used to accomplish the objectives outlined above. Refer to that sample when reading the argument descriptions below. `--help` gives a brief summary of all arguments. The critical parameters are `--fs_names`, `--fs_paths` and `--fs_parts`. In each `fs` stands for feature_set. 
Each parameter is required and each must specify the same number values. `--fs_names` takes two or more values providing feature set names such as `cells` and `drugs`. -`fs_paths` takes path sepecifications for the corresponding feature-set files. All of the usual file search rules apply, they can be relative or absolute paths. Optionally, `--in_dir` can be used to provide common high-level qualification. +`fs_paths` takes path sepecifications for the corresponding feature-set files. All of the usual file search rules apply, they can be relative or absolute paths. Optionally, `--in_dir` can be used to provide common high-level qualification. -`fs_parts` defines the partitioning scheme for each of the feature-sets. So in our scenario above, `--fs_parts 2 2` specifies that at each iteration, both the `cells` and `drugs` feature-sets will be halved, giving the quadrants discussed above at each iteration. Non-symetric partitioning may prove useful when the number of feature-set line items diverges from the "square" model. +`fs_parts` defines the partitioning scheme for each of the feature-sets. So in our scenario above, `--fs_parts 2 2` specifies that at each iteration, both the `cells` and `drugs` feature-sets will be halved, giving the quadrants discussed above at each iteration. Non-symetric partitioning may prove useful when the number of feature-set line items diverges from the "square" model. `--in_dir` is optional. It can be used to simplify the coding of `--fs_paths` path names. The rules of os.path.join() apply. `--out_dir` is optional. It can be used to place output files, the JSON format plan in particular, to a specific directory. -`--debug` is optional. If specified, the final plan dictionary is pretty-printed. This is quite a bit easier to read than the JSON file. +`--debug` is optional. If specified, the final plan dictionary is pretty-printed. This is quite a bit easier to read than the JSON file. `--test` is optional. 
If specified, a demonstration of the plan tree navigation API is . See below. @@ -40,7 +42,7 @@ plangen.cell8-p2.drug8-p2.json is a sample plan constructed using cell and drug ## Plan tree navigation and content retrieval -Given a JSON-format "plan tree" generated by `plangen.py` and loaded by `load_plan()`, the navigation and retrieval functions described below are used to navigate predecessor/successor (i.e. parent/child) relationships useful for synchronizing transfer learning training suites. A plan tree is a true tree. It has a single "root" node at its origin and any number of successor nodes. The root is the predecessor of these successors. Every node in the tree, except for the root, has a single predecessor and zero or more successors. In a transfer learning environment, the successors of a given training session inherit the model of their predecessor once that predecessor completes. +Given a JSON-format "plan tree" generated by `plangen.py` and loaded by `load_plan()`, the navigation and retrieval functions described below are used to navigate predecessor/successor (i.e. parent/child) relationships useful for synchronizing transfer learning training suites. A plan tree is a true tree. It has a single "root" node at its origin and any number of successor nodes. The root is the predecessor of these successors. Every node in the tree, except for the root, has a single predecessor and zero or more successors. In a transfer learning environment, the successors of a given training session inherit the model of their predecessor once that predecessor completes. Each plan tree node is named (it is a dictionary) - the root node is usually named '1' and its value is a dictionary of metadata including the arguments that were used to generate it. Use the `get_node()` function without the `node_name` argument to acquire the root name and its associated metadata. 
All successor node names are derived from their parent by appending sequence numbers separated by a delmiter - but this is of no concern to a program navigating the tree, the names are opaque. @@ -53,7 +55,6 @@ Each plan tree node is named (it is a dictionary) - the root node is usually nam ## Contact Richard Turgeon - -Created: 2019-06-07 + +Created: 2019-06-07 Modified: 2019-06-18 - diff --git a/workflows/cp-leaveout/py/data_setup.py b/workflows/cp-leaveout/py/data_setup.py index 0b2a9201..9e14c659 100644 --- a/workflows/cp-leaveout/py/data_setup.py +++ b/workflows/cp-leaveout/py/data_setup.py @@ -1,92 +1,173 @@ - # DATA SETUP PY -import json +import datetime import os - -from pathlib import Path +import sys +import time import traceback -from runner_utils import ModelResult +from pathlib import Path + import topN_to_uno +from runner_utils import ModelResult +from log_tools import * + + +logger = get_logger(logger, "DATA SETUP") + +logger.info("LOAD:") +sys.stdout.flush() + class TopN_Args: + def __init__(self, dataframe_from, node, plan, output): self.dataframe_from = dataframe_from self.node = node self.plan = plan self.fold = None - self.incremental = 'True' - self.output = output + self.incremental = "True" + self.cell_feature_selection = None + self.drug_feature_selection = None + self.output = output + + +def setup_local_fs(params): + global logger + # username = os.environ['USER'] # No longer works on Summit 2021-10-13 + username = params["user"] + userdir = Path("/mnt/bb/%s" % username) + nvme_enabled = userdir.exists() + logger.info("NVMe: %r" % nvme_enabled) + if not nvme_enabled: + return params + # The training data directory for this workflow node: + nodedir = userdir / params["node"] + os.makedirs(nodedir, exist_ok=True) + # copy original datafrom to NVMe + try: + src = Path(params["dataframe_from"]) + local_orig = userdir / src.name + local_train = nodedir / Path("topN.uno.h5") + dest = Path(local_orig) + if not dest.exists(): + start = time.time() 
+ count = dest.write_bytes(src.read_bytes()) + stop = time.time() + duration = stop - start + rate = count / duration / (1024 * 1024) + logger.info("Original dataframe copied to NVM in " + + "%0.1f seconds (%0.1f MB/s)." % (duration, rate)) + else: + # Report file size: + stats = os.stat(local_orig) + logger.info("Original dataframe already exists in NVM: size=%i" % + stats.st_size) + except Exception as e: + print("Error occurred in copying original dataframe\n" + str(e)) + sys.stdout.flush() + traceback.print_exc() + sys.stdout.flush() + return ModelResult.ERROR + params["dataframe_from"] = dest.resolve() + # WARNING: this changes the location of the training data: + params["dataframe_from"] = local_orig + params["use_exported_data"] = local_train + params["plan"] = str(userdir / Path(params["plan"]).name) + logger.info("Using plan file: " + params["plan"]) + return params + def pre_run(params): - import sys, time - print("data_setup.pre_run(): node: '%s' ..." % params["node"]) - sys.stdout.flush() + global logger - # check NVMe disk is available - username = os.environ['USER'] - nvme_enabled = Path('/mnt/bb/{}'.format(username)).exists() + logger.info("PRE_RUN node: %s ..." % params["node"]) - if nvme_enabled: - # copy original datafrom to NVMe disk space - try: - src = Path(params["dataframe_from"]) - dest = Path("/mnt/bb/{}/{}".format(username, src.name)) - if not dest.exists(): - start = time.time() - count = dest.write_bytes(src.read_bytes()) - stop = time.time() - duration = stop - start - rate = count / duration / (1024*1024) - print("File copy completed. Original dataframe " + - "copied to NVM in %0.1f seconds (%0.1f MB/s)." % - (duration, rate)) - else: - print("File copy skipped. 
Original dataframe already exists in NVM.") - except Exception as e: - print("Error occurred in copying original dataframe\n" + str(e)) - traceback.print_exc() - return ModelResult.ERROR - params["dataframe_from"] = dest.resolve() - params["use_exported_data"] = "/mnt/bb/{}/{}".format(username, params["use_exported_data"]) - # softlink to cache & config file # build node specific training/validation dataset - - args = TopN_Args(params["dataframe_from"], - params["node"], - params["plan"], - params["use_exported_data"]) + + params = setup_local_fs(params) + + args = TopN_Args( + params["dataframe_from"], + params["node"], + params["plan"], + output=params["use_exported_data"], + ) data = params["benchmark_data"] try: - for filename in [ "cache", "uno_auc_model.txt" ]: + for filename in ["uno_auc_model.txt"]: # "cache", if not os.path.islink(filename): - os.symlink(f"{data}/{filename}", filename) + src = f"{data}/{filename}" + logger.info("data_setup: src: (%s)" % src) + logger.info("data_setup: dest: (%s)" % filename) + os.symlink(src, filename) except Exception as e: - print("data_setup: error making symlink: %s\n" % filename + str(e)) + print("data_setup: error making symlink:") + print("data_setup: pwd: " + os.getcwd()) + print("data_setup: src: (%s)" % src) + print("data_setup: dest: (%s)" % filename) + print(str(e)) + sys.stdout.flush() return ModelResult.ERROR try: - print("data_setup: build_dataframe() ...") - start = time.time() - topN_to_uno.build_dataframe(args) - stop = time.time() - duration = stop - start - print("data_setup: build_dataframe() OK : " + - "%0.1f seconds." % duration) + logger.info("build_dataframe(output=%s) ..." % args.output) + sys.stdout.flush() + if not os.path.exists(args.output): + out_orig = args.output + args.output = Path(str(out_orig) + ".part") + start = time.time() + topN_to_uno.build_dataframe(args) + stop = time.time() + duration = stop - start + logger.info("build_dataframe() OK : " + + "%0.1f seconds." 
% duration) + # sys.stdout.flush() + os.rename(args.output, out_orig) + logger.info("rename() OK") + # sys.stdout.flush() + args.output = out_orig + else: + logger.info("data_setup: dataframe exists: %s" % + os.path.realpath(args.output)) + except topN_to_uno.topN_NoDataException: + logger.info("data_setup: topN_NoDataException: SKIP " + + "node: %s" % params["node"]) + # sys.stdout.flush() + directory = params["instance_directory"] + with open(directory + "/NO-DATA.txt", "a") as fp: + ts = datetime.datetime.now() + iso = ts.isoformat(sep=" ", timespec="seconds") + fp.write(iso + "\n") + return ModelResult.SKIP except ValueError: - print("data_setup: caught ValueError for node: '%s'" % - params["node"]) # new 2019-12-02 + print("data_setup: caught ValueError for node: '%s'" % params["node"]) + sys.stdout.flush() traceback.print_exc(file=sys.stdout) - return ModelResult.SKIP + return ModelResult.ERROR except Exception as e: print("data_setup: error in build_dataframe!\n" + str(e)) - traceback.print_exc() + sys.stdout.flush() + traceback.print_exc(file=sys.stdout) + sys.stdout.flush() return ModelResult.ERROR - print("data_setup.pre_run() done.") + logger.info("PRE_RUN done.") + # sys.stdout.flush() return ModelResult.SUCCESS + def post_run(params, output_dict): - print("post_run") + global logger + # logger.info("post_run") + # sys.stdout.flush() + if "use_exported_data" in params: + try: + # os.remove(params["use_exported_data"]) + pass + except OSError as e: + print("Error: %s - %s." 
% (e.filename, e.strerror)) + else: + # print("use_exported_data not in params") + pass return ModelResult.SUCCESS diff --git a/workflows/cp-leaveout/py/planargs.py b/workflows/cp-leaveout/py/planargs.py index 27a1cd60..2bb6b785 100644 --- a/workflows/cp-leaveout/py/planargs.py +++ b/workflows/cp-leaveout/py/planargs.py @@ -1,90 +1,107 @@ -""" -plangen command line argument definitions -""" +"""plangen command line argument definitions.""" +import argparse +import glob import os import sys -import glob -import argparse -partitioning_strategies = ['leaveout', 'undefined1', 'undefined2'] # to be completed ????????????? +partitioning_strategies = [ + "leaveout", + "undefined1", + "undefined2", +] # to be completed ????????????? + def parse_arguments(): - parser = argparse.ArgumentParser( - description='feature-set partioning' - ) + parser = argparse.ArgumentParser(description="feature-set partioning") - parser.add_argument('--in_dir', + parser.add_argument("--in_dir", type=str, - help='Directory containing feature-set list files') + help="Directory containing feature-set list files") - parser.add_argument('--out_dir', - default='results', - type=str, - help='Directory to contain generated plan files') + parser.add_argument( + "--out_dir", + default="results", + type=str, + help="Directory to contain generated plan files", + ) - parser.add_argument('--json', - action='store_true', - help='Generate plan in JSON format') + parser.add_argument("--json", + action="store_true", + help="Generate plan in JSON format") - parser.add_argument('--overwrite', - action='store_true', - help='Accept non-empty out_dir, contents overwritten') + parser.add_argument( + "--overwrite", + action="store_true", + help="Accept non-empty out_dir, contents overwritten", + ) - parser.add_argument ('--partition_strategy', - choices=partitioning_strategies, - default=partitioning_strategies[0], - help='Specify a feature-set partitioning strategy') + parser.add_argument( + "--partition_strategy", + 
choices=partitioning_strategies, + default=partitioning_strategies[0], + help="Specify a feature-set partitioning strategy", + ) # The following fs_* arguments are required, the number of values specified for each - # must match, and at least two values are required for each - - parser.add_argument('--fs_names', - required=True, - type=str, - nargs='+', - help='Specify a list of (arbitrary) feature-set names') + # must match, and at least two values are required for each + + parser.add_argument( + "--fs_names", + required=True, + type=str, + nargs="+", + help="Specify a list of (arbitrary) feature-set names", + ) - parser.add_argument('--fs_paths', - required=True, - type=str, - nargs='+', - help='Specify a list of feature-set file paths') + parser.add_argument( + "--fs_paths", + required=True, + type=str, + nargs="+", + help="Specify a list of feature-set file paths", + ) - parser.add_argument('--fs_parts', - required=True, - type=int, - nargs='+', - help='Specify a list of partition counts') + parser.add_argument( + "--fs_parts", + required=True, + type=int, + nargs="+", + help="Specify a list of partition counts", + ) - parser.add_argument('--first_parts', - required=False, - type=int, - nargs='+', - help='Optionally, specify a list of first pass partition counts') + parser.add_argument( + "--first_parts", + required=False, + type=int, + nargs="+", + help="Optionally, specify a list of first pass partition counts", + ) - # misc + # misc - parser.add_argument('--maxdepth', - type=int, - default=0, - help='Apply a constraint to the plan tree depth') + parser.add_argument( + "--maxdepth", + type=int, + default=0, + help="Apply a constraint to the plan tree depth", + ) - parser.add_argument('--verbose', - action='store_true', - help='Verbosity') + parser.add_argument("--verbose", action="store_true", help="Verbosity") - parser.add_argument('--debug', - action='store_true', - help='Show complete plan tree structure') + parser.add_argument("--debug", + 
action="store_true", + help="Show complete plan tree structure") - parser.add_argument('--print_tree', - action='store_true', - help='Dump the complete plan tree - potentially lengthy!') + parser.add_argument( + "--print_tree", + action="store_true", + help="Dump the complete plan tree - potentially lengthy!", + ) - parser.add_argument('--test', - action='store_true', - help='Test plan navigation and entry retrieval') + parser.add_argument("--test", + action="store_true", + help="Test plan navigation and entry retrieval") - args= parser.parse_args() + args = parser.parse_args() return args diff --git a/workflows/cp-leaveout/py/plangen.py b/workflows/cp-leaveout/py/plangen.py index bb440cde..50e1a346 100644 --- a/workflows/cp-leaveout/py/plangen.py +++ b/workflows/cp-leaveout/py/plangen.py @@ -1,31 +1,33 @@ - -from collections import deque -from collections import namedtuple -from enum import Enum import glob import itertools as it import json -import numpy as np import os -import sys import sqlite3 +import sys +import time +import traceback +from abc import ABC, abstractmethod # abstract class support +from collections import OrderedDict, deque, namedtuple +from datetime import datetime +from enum import Enum +from pprint import pprint as pp from sqlite3 import Error as db_Error +import numpy as np import planargs - -from abc import ABC, abstractmethod # abstract class support -from collections import OrderedDict from scipy.special import comb -from pprint import pprint as pp -from datetime import datetime -ISO_TIMESTAMP = "seconds" # timestamp to ISO string -ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp -DEBUG_SQL = False +ISO_TIMESTAMP = "seconds" # timestamp to ISO string +ISO_TIMESTAMP_ENCODE = "%Y-%m-%dT%H:%M:%S" # ISO string to timestamp +DEBUG_SQL = False # True + +conn = None +csr = None + def isempty(path): """Determine whether the given directory is empty.""" - flist = glob.glob(os.path.join(path,'*')) + flist = 
glob.glob(os.path.join(path, "*")) return flist == [] @@ -83,7 +85,9 @@ def validate_args(args): reqd_lengths = [nbr_feature_sets] * 4 if test_lengths != reqd_lengths: - sys.exit("Error: The lengths of all feature set definition args (fs_<>) must be identical") + sys.exit( + "Error: The lengths of all feature set definition args (fs_<>) must be identical" + ) if nbr_feature_sets <= 1: sys.exit("Error: Partitioning requires multiple feature sets") @@ -96,23 +100,29 @@ def validate_args(args): # validate input and output directories if args.in_dir and not os.path.isdir(args.in_dir): - sys.exit("Error: --in_dir must designate a directory, '%s' is not valid" % args.in_dir) + sys.exit( + "Error: --in_dir must designate a directory, '%s' is not valid" % + args.in_dir) if not os.path.isdir(args.out_dir): - sys.exit("Error: --out_dir must designate a directory, '%s' is not valid" % args.out_dir) + sys.exit( + "Error: --out_dir must designate a directory, '%s' is not valid" % + args.out_dir) if not args.overwrite and not isempty(args.out_dir): - sys.exit("Error: --out_dir '%s' is not empty, --overwrite not specified" % args.out_dir) + sys.exit( + "Error: --out_dir '%s' is not empty, --overwrite not specified" % + args.out_dir) if verbose: print("Writing plan information to %s" % os.path.abspath(args.out_dir)) - # expand, validate and load input feature-set content lists + # expand, validate and load input feature-set content lists fs_content = [] args.fs_lines = [] file_error = False if args.in_dir == None: - args.in_dir = '' # prepare for use in os.path.join() + args.in_dir = "" # prepare for use in os.path.join() for i, path in enumerate(args.fs_paths): fullpath = os.path.join(args.in_dir, path) @@ -120,23 +130,23 @@ def validate_args(args): file_error = True print("Error: %s file not found" % fullpath) else: - with open(fullpath, 'r') as f: # read text and sanitize + with open(fullpath, "r") as f: # read text and sanitize raw_lines = f.readlines() text = [line.strip() 
for line in raw_lines] - text = [l for l in text if l != ''] + text = [l for l in text if l != ""] fs_content.append(text) args.fs_lines.append(len(text)) if verbose: - print("Loading '%s' feature set definition from %s - %d lines" - % (args.fs_names[i], fullpath, len(text))) + print("Loading '%s' feature set definition from %s - %d lines" % + (args.fs_names[i], fullpath, len(text))) if file_error: sys.exit("Terminating due to error") # construct a partitioning object exporting a partion() function - if args.partition_strategy == 'leaveout': + if args.partition_strategy == "leaveout": generator = LeaveoutSubsetGenerator() # return feature-set contents lists @@ -155,47 +165,40 @@ class SubsetGenerator(ABC): partitioning schemes. Subclasses should implement their specializations. """ - def __init__(self, name=''): + def __init__(self, name=""): self.name = name self.term_msg = "Terminating due to error" @abstractmethod - def partition( - self, - base, - size=None, - count=None, - name='-unspecified-' - ): + def partition(self, base, size=None, count=None, name="-unspecified-"): """Partition a feature-set array. - Partition the 'base', a list of elements, using the abstract arguments - 'size' and 'count' to tailor the implementation's algorithm. 'name' is - used in error reporting and is optional. + Partition the 'base', a list of elements, using the abstract + arguments 'size' and 'count' to tailor the implementation's + algorithm. 'name' is used in error reporting and is optional. """ validate(self, base, size, count, name) return [] def get_plan_label(self, plan_dict, root_name): root = plan_dict[root_name] - return root['label'] + return root["label"] - def _validation_error(self, base_len, size, count, name='-unspecified-'): - """Provide a common error reporting function. 
""" + def _validation_error(self, base_len, size, count, name="-unspecified-"): + """Provide a common error reporting function.""" print("Base list length: %d requested %d sublists of length %d" % - (base_len, count, size)) + (base_len, count, size)) - def validate(self, base, size=None, count=None, name='-unspecified-'): + def validate(self, base, size=None, count=None, name="-unspecified-"): """Provide basic request validation, specific generators may impose - additional requirements. - """ + additional requirements.""" berror = False base_len = len(base) if size == None or size <= 0 or size > base_len: berror = True else: - unique_combos = comb(base_len, size) # implements N take K + unique_combos = comb(base_len, size) # implements N take K if count > unique_combos: berror = True if berror: @@ -203,22 +206,27 @@ def validate(self, base, size=None, count=None, name='-unspecified-'): return not berror + # # UNDER EVALUATION ????????????????????????????????????????????????????? # + class IterativeSubsetGenerator(SubsetGenerator): - """ Tom Brettin method... subset generation via iteration over base""" + """Tom Brettin method... 
+ + subset generation via iteration over base + """ + def __init__(self): - SubsetGenerator.__init__(self, 'IterativeSubsetGenerator') + SubsetGenerator.__init__(self, "IterativeSubsetGenerator") def partition(self, base, size=None, count=0, name=None): - """ """ + """""" if size is None: print("Error: Unspecified list partitioning size") sys.exit(3) - """ base_len = len(base) if count == 0: # a simplification useful in the iterative approach @@ -240,18 +248,19 @@ def partition(self, base, size=None, count=0, name=None): omit_size = base_len - size increment = min(size, omit_size) - # omit consecutive blocks of feature-name entries + # omit consecutive blocks of feature-name entries for i in range(count): org = i * increment if org >= base_len: org = org % base_len if org == 0 and i > 0: - print("Warning: %d sublists of %s completed short of the requested %d" + print( + "Warning: %d sublists of %s completed short of the requested %d" % (i, name, count)) break end = org + size - sublist = np_base.take(range(org, end), mode='wrap') + sublist = np_base.take(range(org, end), mode="wrap") print(sublist) selected_sublists.append(sublist) @@ -261,43 +270,52 @@ def partition(self, base, size=None, count=0, name=None): class LeaveoutSubsetGenerator(SubsetGenerator): """CANDLE milestone 13 style feature set partitioning. - All SubsetGenerator subclasses are required to implement partition(), - plan_init() and plan_term() functions. + All SubsetGenerator subclasses are required to implement + partition(), plan_init() and plan_term() functions. 
""" def __init__(self): - SubsetGenerator.__init__(self, 'LeaveoutSubsetGenerator') + SubsetGenerator.__init__(self, "LeaveoutSubsetGenerator") self.strategy = "leaveout" - def plan_init(self, fs_names, fs_paths, fs_lines, fs_parts, maxdepth, root_name='1'): - """Initialize - collect plan metadata """ + def plan_init(self, + fs_names, + fs_paths, + fs_lines, + fs_parts, + maxdepth, + root_name="1"): + """Initialize - collect plan metadata""" currtime = datetime.now() - details = {'fs_names': fs_names, 'fs_filepaths':fs_paths, 'fs_parts': fs_parts} - details['create_date'] = currtime.isoformat(timespec=ISO_TIMESTAMP) - details['strategy'] = self.strategy - - label = '' + details = { + "fs_names": fs_names, + "fs_filepaths": fs_paths, + "fs_parts": fs_parts + } + details["create_date"] = currtime.isoformat(timespec=ISO_TIMESTAMP) + details["strategy"] = self.strategy + + label = "" for i in range(len(fs_names)): if i != 0: - label += '_' - s = '{}{}-p{}'.format(fs_names[i], fs_lines[i], fs_parts[i]) + label += "_" + s = "{}{}-p{}".format(fs_names[i], fs_lines[i], fs_parts[i]) label += s if maxdepth > 0: - label += '-maxdepth{}'.format(maxdepth) + label += "-maxdepth{}".format(maxdepth) - details['label'] = label + details["label"] = label plan_dict = OrderedDict() plan_dict[root_name] = details return root_name, plan_dict def plan_term(self, plan_dict, root_name, nbr_subplans): - """Completion - post plan summary metadata """ + """Completion - post plan summary metadata""" meta = plan_dict[root_name] - meta['nbr_subplans'] = nbr_subplans + meta["nbr_subplans"] = nbr_subplans - - def partition(self, base, size='n/a', count=None, name=None): + def partition(self, base, size="n/a", count=None, name=None): """Partition a feature-set list into lists of equal sized elements. 
This partitioner accepts a list of feature-set names and returns @@ -329,7 +347,7 @@ def partition(self, base, size='n/a', count=None, name=None): """ base_len = len(base) - if base_len < count: # can partition any further? + if base_len < count: # can partition any further? return [[feature] for feature in base] size = base_len // count @@ -346,19 +364,23 @@ def partition(self, base, size='n/a', count=None, name=None): return sublists -#------------------------------------------------------------------------------ + +# ------------------------------------------------------------------------------ # Database support, table and column definitions, DDL and DML # Refer to the plan_prep() function for a discussion of the "planstat" and # "runhist" tables defined below. -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + class RunType(Enum): RUN_ALL = 0 RESTART = 1 -class RunStat(Enum): # subplan execution status - SCHEDULED = 'scheduled' - COMPLETE = 'complete' + +class RunStat(Enum): # subplan execution status + SCHEDULED = "scheduled" + COMPLETE = "complete" + # planstat table, rows are returned via the PlanstatRow namedtuple @@ -371,19 +393,16 @@ class RunStat(Enum): # subplan execution status nbr_subplans INTEGER ); """ -PlanstatRow = namedtuple('PlanstatRow', +PlanstatRow = namedtuple( + "PlanstatRow", [ - 'rowid', - 'plan_name', - 'create_date', - 'feature_sets', - 'partitions', - 'nbr_subplans' - ] + "rowid", "plan_name", "create_date", "feature_sets", "partitions", + "nbr_subplans" + ], ) _select_row_from_planstat = """ - SELECT rowid, + SELECT rowid, plan_name, create_date, feature_sets, partitions, nbr_subplans FROM planstat WHERE plan_name='{}' @@ -420,23 +439,24 @@ class RunStat(Enum): # subplan execution status PRIMARY KEY (plan_id, subplan_id) ); """ -RunhistRow = namedtuple('RunhistRow', +RunhistRow = namedtuple( + "RunhistRow", [ - 'plan_id', 
- 'subplan_id', - 'status', - 'start_time', - 'stop_time', - 'run_mins', - 'loss', - 'mae', - 'r2', - 'val_loss', - 'val_mae', - 'val_r2', - 'lr', - 'other_info' - ] + "plan_id", + "subplan_id", + "status", + "start_time", + "stop_time", + "run_mins", + "loss", + "mae", + "r2", + "val_loss", + "val_mae", + "val_r2", + "lr", + "other_info", + ], ) _select_row_from_runhist = """ @@ -480,17 +500,35 @@ class RunStat(Enum): # subplan execution status DELETE FROM runhist where plan_id = {} """ -#------------------------------------------------------------------------------ + +# def log(msg): +# if DEBUG_SQL: +# with open("plangen_db.log", "a") as fp: +# fp.write(msg + "\n") +# fp.flush() + +from log_tools import * + + +logger = get_logger(logger, "PLANGEN", milliseconds=True) + + +def log(msg): + logger.debug(msg) + + +# ------------------------------------------------------------------------------ # "Plan management" Database functions # -# db_connect - establish database connection returning conn handle -# execute_sql_stmt - execute a SQL statement with optional error trap +# db_connect - establish database connection returning conn handle +# execute_sql_stmt - execute a SQL statement with optional error trap # plan_prep - prepare for the execution of a multi-step "plan" -# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow -# stop_subplan - stop a subplan, update RunhistRow -# get_subplan_runhist - return a RunhistRow for a given subplan +# start_subplan - start a subplan, (ex. '1.4.8'), write RunhistRow +# stop_subplan - stop a subplan, update RunhistRow +# get_subplan_runhist - return a RunhistRow for a given subplan # plan_remove - remove all database records for the named plan -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ + def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): """Execute a SQL statement. 
@@ -520,17 +558,19 @@ def execute_sql_stmt(conn, stmt, cursor=None, trap_exception=False): else: lclcsr = conn.cursor() try: - if DEBUG_SQL: - with open("plangen_db.log", "a") as fp: - fp.write("STMT: " + stmt + "\n") - db_exception = False + # log("STMT: " + stmt) lclcsr.execute(stmt) except db_Error as e: db_exception = True - print('execute_sql_stmt:', stmt) - print('execute_sql_stmt:', e) + print("execute_sql_stmt: caught exception") + print("execute_sql_stmt:", stmt) + print("execute_sql_stmt:", e) + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print("PLANGEN TRACEBACK:\n" + str(e) + " ... \n" + "".join(s)) + sys.stdout.flush() if not trap_exception: raise finally: @@ -549,7 +589,7 @@ def db_connect(db_path): """Connect to the plan management database. Establish a connection to the sqlite3 database contained in the named file. - A plan management database is created and populated at db_path if the file + A plan management database is created and populated at db_path if the file does not exist. 
Args @@ -559,7 +599,7 @@ def db_connect(db_path): A connection handle is returned upon success, else None """ - if db_path == ':memory:' or not os.path.exists(db_path): + if db_path == ":memory:" or not os.path.exists(db_path): prev_allocated = False else: prev_allocated = True @@ -567,12 +607,12 @@ def db_connect(db_path): try: conn = sqlite3.connect(db_path) except db_Error as error: - print('db_connect', error) + print("db_connect", error) raise - # create plan management tables on initial database allocation + # create plan management tables on initial database allocation if conn and not prev_allocated: - complete = execute_sql_stmt(conn, _planstat_ddl) + complete = execute_sql_stmt(conn, _planstat_ddl) complete &= execute_sql_stmt(conn, _runhist_ddl) if complete: @@ -599,10 +639,10 @@ def plan_remove(db_path, plan_path): conn = db_connect(db_path) plan_key = _get_planstat_key(plan_path) stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, cursor=csr) nrow = csr.rowcount - row = csr.fetchone() + row = csr.fetchone() print("%d run history rows deleted" % nrow) @@ -610,8 +650,8 @@ def plan_remove(db_path, plan_path): print("Error: CLEANUP request failed - %s has not been run" % plan_key) status = -1 else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid is the plan uniquifier + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid is the plan uniquifier _delete_runhistory(conn, rowid) stmt = _delete_planstat_plan.format(rowid) status = execute_sql_stmt(conn, stmt) @@ -662,58 +702,60 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): """ # load the plan and retrieve identity info - plan_dict = load_plan(plan_path) - create_date = get_plan_create_date(plan_dict) + plan_dict = load_plan(plan_path) + create_date = get_plan_create_date(plan_dict) feature_sets = 
get_plan_fs_names(plan_dict) - partitions = get_plan_fs_parts(plan_dict) - nbr_subplans = get_plan_nbr_subplans(plan_dict) + partitions = get_plan_fs_parts(plan_dict) + nbr_subplans = get_plan_nbr_subplans(plan_dict) - # determine if a plan of the given name has already been registered + # determine if a plan of the given name has already been registered conn = db_connect(db_path) plan_key = _get_planstat_key(plan_path) stmt = _select_row_from_planstat.format(plan_key) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, cursor=csr) - row = csr.fetchone() + row = csr.fetchone() if not row: rowid = -1 else: - plan_rec = PlanstatRow._make(row) # column-name addressable - rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned + plan_rec = PlanstatRow._make(row) # column-name addressable + rowid = plan_rec.rowid # the unique rowid will be the uniquifier returned - # compare run_type to initial expectations + # compare run_type to initial expectations error = False if run_type == RunType.RUN_ALL and rowid > 0: - print("Error: RUN_ALL specified but plan: %s has already been defined" % plan_key) + print("Error: RUN_ALL specified but plan: %s has already been defined" % + plan_key) error = True elif run_type == RunType.RESTART and rowid < 0: - print("Warning: RESTART specified but plan: %s has not been previously run" % plan_key) - - elif rowid > 0 and create_date != create_date: # DEBUG ???????????????????????????????????? plan_rec.create_date: - print("Error: RESTART specified but the signature of the previously defined plan: %s does not match" % plan_key) + print( + "Warning: RESTART specified but plan: %s has not been previously run" + % plan_key) + + elif (rowid > 0 and create_date != create_date + ): # DEBUG ???????????????????????????????????? 
plan_rec.create_date: + print( + "Error: RESTART specified but the signature of the previously defined plan: %s does not match" + % plan_key) error = True # register new plans acquiring the uniquifying plan_id used to compose runhistory table keys if not error and rowid < 0: feature_sets = str(feature_sets) - feature_sets = feature_sets.replace("'", "") # create string literal from list of str - partitions = str(partitions) # create string literal from list of int - - stmt = _insert_planstat_plan.format( - plan_key, - create_date, - feature_sets, - partitions, - nbr_subplans - ) + feature_sets = feature_sets.replace( + "'", "") # create string literal from list of str + partitions = str(partitions) # create string literal from list of int + + stmt = _insert_planstat_plan.format(plan_key, create_date, feature_sets, + partitions, nbr_subplans) status = execute_sql_stmt(conn, stmt, cursor=csr) rowid = csr.lastrowid - # cleanup resources and return uniquifier or error indicator + # cleanup resources and return uniquifier or error indicator csr.close() conn.commit() @@ -723,7 +765,11 @@ def plan_prep(db_path, plan_path, run_type=RunType.RUN_ALL): return rowid -def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=None): +def start_subplan(db_path, + plan_path, + plan_id=None, + subplan_id=None, + run_type=None): """Schedule the execution of a subplan. This function writes a RunhistRow record to the runhist table indicating that @@ -743,44 +789,67 @@ def start_subplan(db_path, plan_path, plan_id=None, subplan_id=None, run_type=No the subplan. -1 is returned from a RESTART call if the a RunhistRow already exists for the plan/subplan and is marked COMPLETE. 
""" - - conn = db_connect(db_path) - csr = conn.cursor() + global conn, csr + start = time.time() + log("start_subplan: subplan_id=%s" % subplan_id) + # sys.stdout.flush() + if conn is None: + conn = db_connect(db_path) + csr = conn.cursor() + # conn.execute('PRAGMA journal_mode = WAL') + conn.execute('PRAGMA synchronous = OFF') skip = False + log("start_subplan: run_type: '%s'" % str(run_type)) + # log("plangen: start_subplan: run_type type: %s" % str(type(run_type))) + log("start_subplan: base: '%s'" % str(RunType.RESTART)) + # sys.stdout.flush() + # skip previously completed work if RESTART - if run_type == RunType.RESTART: + if "RESTART" in str(run_type): + log("start_subplan: checking restart: %i" % plan_id) + # sys.stdout.flush() stmt = _select_row_from_runhist.format(plan_id, subplan_id) execute_sql_stmt(conn, stmt, cursor=csr) row = csr.fetchone() if row: + log("start_subplan: found row.") runhist_rec = RunhistRow._make(row) + log("start_subplan: found '%s'" % runhist_rec.status) if runhist_rec.status == RunStat.COMPLETE.name: skip = True + # log("start_subplan: skip %r" % skip) + else: + log("start_subplan: not checking restart") + # sys.stdout.flush() - # construct/reinit a new runhist record + # construct/reinit a new runhist record if not skip: currtime = datetime.now() start_time = currtime.isoformat(timespec=ISO_TIMESTAMP) - stmt = _insupd_scheduled_runhist.format( - plan_id, - subplan_id, - RunStat.SCHEDULED.name, - start_time - ) + stmt = _insupd_scheduled_runhist.format(plan_id, subplan_id, + RunStat.SCHEDULED.name, + start_time) execute_sql_stmt(conn, stmt, cursor=csr) - csr.close() + # csr.close() conn.commit() - conn.close() + # conn.close() if skip: - return -1 + token = "SKIP" + result = -1 else: - return 0 + token = "RUN" + result = 0 + + log("start_subplan: subplan_id=%s: %s" % (subplan_id, result)) + stop = time.time() + log("start_subplan: time: %0.3f" % (stop - start)) + return result def stop_subplan(db_path, plan_id=None, 
subplan_id=None, comp_info_dict={}): @@ -802,21 +871,23 @@ def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): """ conn = db_connect(db_path) - csr = conn.cursor() - curr_time = datetime.now() + csr = conn.cursor() + curr_time = datetime.now() stop_time = curr_time.isoformat(timespec=ISO_TIMESTAMP) - comp_dict = dict( - loss=0.0, mae=0.0, r2=0.0, - val_loss=0.0, val_mae=0.0, val_r2=0.0, - lr=0.0 - ) + comp_dict = dict(loss=0.0, + mae=0.0, + r2=0.0, + val_loss=0.0, + val_mae=0.0, + val_r2=0.0, + lr=0.0) comp_info_dict = extract_history(comp_info_dict) remainder = _acquire_actuals(comp_dict, comp_info_dict) if len(remainder) == 0: - other_info = '' + other_info = "" else: other_info = json.dumps(remainder) @@ -825,30 +896,31 @@ def stop_subplan(db_path, plan_id=None, subplan_id=None, comp_info_dict={}): execute_sql_stmt(conn, stmt, csr) row = csr.fetchone() - if row: # expected, caller error if already marked COMPLETED + if row: # expected, caller error if already marked COMPLETED runhist_rec = RunhistRow._make(row) if runhist_rec.status != RunStat.COMPLETE.name: - start_time = datetime.strptime(runhist_rec.start_time, ISO_TIMESTAMP_ENCODE) - duration = curr_time - start_time - run_mins = int((duration.total_seconds() + 59) / 60) + start_time = datetime.strptime(runhist_rec.start_time, + ISO_TIMESTAMP_ENCODE) + duration = curr_time - start_time + run_mins = int((duration.total_seconds() + 59) / 60) # update runhist record stmt = _insupd_completed_runhist.format( - # column values + # column values RunStat.COMPLETE.name, stop_time, run_mins, - comp_dict['loss'], - comp_dict['mae'], - comp_dict['r2'], - comp_dict['val_loss'], - comp_dict['val_mae'], - comp_dict['val_r2'], - comp_dict['lr'], + comp_dict["loss"], + comp_dict["mae"], + comp_dict["r2"], + comp_dict["val_loss"], + comp_dict["val_mae"], + comp_dict["val_r2"], + comp_dict["lr"], other_info, - # key spec + # key spec plan_id, - subplan_id + subplan_id, ) execute_sql_stmt(conn, stmt) @@ 
-900,7 +972,7 @@ def get_subplan_runhist(db_path, plan_id=None, subplan_id=None): """ conn = db_connect(db_path) stmt = _select_row_from_runhist.format(plan_id, subplan_id) - csr = conn.cursor() + csr = conn.cursor() execute_sql_stmt(conn, stmt, csr) row = csr.fetchone() @@ -920,19 +992,19 @@ def _acquire_actuals(dft_dict, actuals_dict): dft_dict[key] = actuals[key] actuals.pop(key) - return actuals # possibly empty + return actuals # possibly empty def _get_planstat_key(plan_path): """Extract the name portion of a plan from a filepath.""" basename = os.path.basename(plan_path) - basepfx = basename.split(sep='.') + basepfx = basename.split(sep=".") return basepfx[0] def _delete_runhistory(conn, plan_id): """Delete RunhistRows containing the given plan_id.""" - csr = conn.cursor() + csr = conn.cursor() stmt = _delete_from_runhistory.format(plan_id) execute_sql_stmt(conn, stmt, cursor=csr, trap_exception=True) rowcount = csr.rowcount @@ -941,9 +1013,10 @@ def _delete_runhistory(conn, plan_id): return rowcount -#------------------------------------------------------------------------------ -# Plan navigation, content retrieval -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Plan navigation, content retrieval +# ------------------------------------------------------------------------------ + def load_plan(filepath): """Load a JSON transfer learning plan. @@ -959,30 +1032,36 @@ def load_plan(filepath): An entry-ordered plan in OrderedDict format is returned. 
""" - with open(filepath, 'r') as f: + with open(filepath, "r") as f: ordered_plan_dict = json.load(f, object_pairs_hook=OrderedDict) return ordered_plan_dict + def get_plan_create_date(plan_dict): _, value = _get_first_entry(plan_dict) - return value['create_date'] + return value["create_date"] + def get_plan_fs_names(plan_dict): _, value = _get_first_entry(plan_dict) - return value['fs_names'] + return value["fs_names"] + def get_plan_fs_parts(plan_dict): _, value = _get_first_entry(plan_dict) - return value['fs_parts'] + return value["fs_parts"] + def get_plan_nbr_subplans(plan_dict): _, value = _get_first_entry(plan_dict) - return value['nbr_subplans'] + return value["nbr_subplans"] + def _get_first_entry(ordered_dict): key, value = next(iter(ordered_dict.items())) return key, value + def get_subplan(plan_dict, subplan_id=None): """Retrieve the content of a named subplan or the root plan. @@ -995,10 +1074,12 @@ def get_subplan(plan_dict, subplan_id=None): A (content, subplan_id) pair is returned. The returned name is useful when using default arguments to retrieve the root plan. """ - if subplan_id is None: subplan_id, content = _get_first_entry(plan_dict) else: + # print("get_subplan dump:") + # json.dump(plan_dict, sys.stdout, indent=2) + # print("keys: %i" % len(plan_dict.keys())) content = plan_dict.get(subplan_id) return content, subplan_id @@ -1018,12 +1099,12 @@ def get_predecessor(plan_dict, subplan_id): is specified None is returned. """ - segments = subplan_id.split(sep='.') + segments = subplan_id.split(sep=".") if len(segments) <= 1: subplan_id = None else: segments.pop() - subplan_id = '.'.join(segments) + subplan_id = ".".join(segments) return subplan_id @@ -1043,7 +1124,7 @@ def get_successors(plan_dict, subplan_id): """ successor_names = [] for i in it.count(start=1): - new_name = subplan_id + '.' + str(i) + new_name = subplan_id + "." 
+ str(i) value = plan_dict.get(new_name) if not value: break @@ -1052,12 +1133,17 @@ def get_successors(plan_dict, subplan_id): return successor_names -def _get_named_set(plan_dict, subplan_id, section_tag, fs_name, collector, parent_features=None): - """ """ +def _get_named_set(plan_dict, + subplan_id, + section_tag, + fs_name, + collector, + parent_features=None): + """""" while True: content, _ = get_subplan(plan_dict, subplan_id) - assert(content) + assert content section = content[section_tag] for i, section_features in enumerate(section): @@ -1111,42 +1197,44 @@ def get_subplan_features(plan_dict, subplan_id, parent_features=False): """ # acquire feature_set names populated in the plan + print("get_subplan_features(): " + subplan_id) content, _ = get_subplan(plan_dict, subplan_id) - if not content: - return None, None + if content is None: + print("get_subplan() found no content!") + return None, None, None, None # peek inside the training set to capture active feature-set names - train_set = content['train'][0] + train_set = content["train"][0] fs_names = [name for name in train_set.keys()] - # categorize the results + # categorize the results result = {} result[0] = fs_names - result['train'] = {} - result['val'] = {} + result["train"] = {} + result["val"] = {} - for set_name, pf in [('train', True), ('val', False)]: + for set_name, pf in [("train", True), ("val", False)]: if pf == True: pf = parent_features for fs_name in fs_names: collector = [] - _get_named_set( - plan_dict, - subplan_id, - set_name, - fs_name, - collector, - parent_features=pf - ) + _get_named_set(plan_dict, + subplan_id, + set_name, + fs_name, + collector, + parent_features=pf) result[set_name][fs_name] = collector - return result, result[0], result['train'], result['val'] + return result, result[0], result["train"], result["val"] + + +# ------------------------------------------------------------------------------ +# Plan construction +# 
------------------------------------------------------------------------------ -#------------------------------------------------------------------------------ -# Plan construction -#------------------------------------------------------------------------------ def build_dictionary_from_lists(seq_list, names): """Create a dictionary with 'names' as labels and 'seq_list' values.""" @@ -1156,7 +1244,12 @@ def build_dictionary_from_lists(seq_list, names): return dict -def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_pfx='', plan_pfx=''): +def build_plan_tree(args, + feature_set_content, + parent_plan_id="", + depth=0, + data_pfx="", + plan_pfx=""): """Generate a plan supporting training, transfer-learning, resume-training. ADD GENERAL DOC @@ -1195,9 +1288,9 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ all_parts = [] - #flat_partitions = [] # preserve, used for file-based approach - #files = [] # preserve, used for file-based approach - #sequence = 0 # preserve, used for file-based approach + # flat_partitions = [] # preserve, used for file-based approach + # files = [] # preserve, used for file-based approach + # sequence = 0 # preserve, used for file-based approach xxx = False for i in range(len(args.fs_names)): @@ -1206,7 +1299,8 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ if depth == 0: count = args.first_parts[i] feature_set_name = args.fs_names[i] - partitions = args.generator.partition(feature_set_content[i], count=count) + partitions = args.generator.partition(feature_set_content[i], + count=count) all_parts.append(partitions) # acquire a cross-product of all feature-set partitions @@ -1227,20 +1321,27 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ else: train.append(section) - # generate next depth/level (successor) plans - curr_plan_id = '{}.{}'.format(parent_plan_id, step + 1) - args.plan_dict[curr_plan_id] = 
{'val': val, 'train': train} - data_name = '{}.{}'.format(data_pfx, step + 1) - plan_name = '{}.{}'.format(plan_pfx, step + 1) + # generate next depth/level (successor) plans + curr_plan_id = "{}.{}".format(parent_plan_id, step + 1) + args.plan_dict[curr_plan_id] = {"val": val, "train": train} + data_name = "{}.{}".format(data_pfx, step + 1) + plan_name = "{}.{}".format(plan_pfx, step + 1) - # depth-first, shorthand representation of tree showing first feature names + # depth-first, shorthand representation of tree showing first feature names if args.debug: - indent = ' ' * (depth * 4) + indent = " " * (depth * 4) print(indent, curr_plan_id) - indent += ' ' * 4 + indent += " " * 4 fs = parts_xprod[step] for i in range(len(fs)): - print(indent, args.fs_names[i], 'count:', len(fs[i]), 'first:', fs[i][0]) + print( + indent, + args.fs_names[i], + "count:", + len(fs[i]), + "first:", + fs[i][0], + ) substeps += build_plan_tree( args, @@ -1248,12 +1349,11 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ parent_plan_id=curr_plan_id, depth=curr_depth, data_pfx=data_name, - plan_pfx=plan_name + plan_pfx=plan_name, ) steps += substeps return steps - """ # THIS IS A WORK-IN-PROGRESS ... 
GENERATING FILES FOR DATA AND PLAN @@ -1317,50 +1417,52 @@ def build_plan_tree(args, feature_set_content, parent_plan_id='', depth=0, data_ return """ + def write_file(fname, title, string_list): """Write text expressed as an array of lines to file.""" - with open(fname, 'w') as f: + with open(fname, "w") as f: for line in string_list: f.write(line) + def write_dict_to_json(dictionary, fname): """Write dictionary to a json file.""" - with open(fname, 'w') as f: + with open(fname, "w") as f: json.dump(dictionary, f) -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # various hard-coded lists, test cases - the synthetic feature-sets remain useful -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- + +# synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] +# synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] -""" -synthetic_cell_names = ['cell_' + '%04d' % (x) for x in range(1000)] -synthetic_drug_names = ['drug_' + '%04d' % (x) for x in range(1000)] -""" +# ---------------------------------------------------------------------------------- +# mainline +# ---------------------------------------------------------------------------------- -#---------------------------------------------------------------------------------- -# mainline -#---------------------------------------------------------------------------------- def main(): # Acquire and validate arguments args = planargs.parse_arguments() - args.json = True # the only available option thus far + args.json = True # the only available option thus far generator, feature_set_content = validate_args(args) args.generator = generator root_name, args.plan_dict = generator.plan_init( - fs_names = args.fs_names, # validated cmdline arg - fs_paths = 
args.fs_paths, # validated cmdline arg - fs_lines = args.fs_lines, # created by validate_args - fs_parts = args.fs_parts, # validated cmdline arg - maxdepth = args.maxdepth + fs_names=args.fs_names, # validated cmdline arg + fs_paths=args.fs_paths, # validated cmdline arg + fs_lines=args.fs_lines, # created by validate_args + fs_parts=args.fs_parts, # validated cmdline arg + maxdepth=args.maxdepth, ) - # feature_set_content = [cell_names, drug_names] + # feature_set_content = [cell_names, drug_names] # feature_set_content = [synthetic_cell_names, synthetic_drug_names] - # remove by-1 dimensions, they do not need to be represented in the plan explicitly + # remove by-1 dimensions, they do not need to be represented in the plan explicitly while True: try: ndx = args.fs_parts.index(1) @@ -1371,24 +1473,27 @@ def main(): except ValueError: break - # Plan generation - data_fname_pfx = os.path.join(args.out_dir, 'DATA.1') - plan_fname_pfx = os.path.join(args.out_dir, 'PLAN.1') + # Plan generation + data_fname_pfx = os.path.join(args.out_dir, "DATA.1") + plan_fname_pfx = os.path.join(args.out_dir, "PLAN.1") steps = build_plan_tree( - args, # command line argument namespace - feature_set_content, # for example [[cell1 ... celln] [drug1 ... drugn]] - parent_plan_id=root_name, # name of root plan, subplan names created from this stem - data_pfx=data_fname_pfx, # DATA file prefix, building block for feature name files - plan_pfx=plan_fname_pfx # PLAN file prefix, building block for plan name files + args, # command line argument namespace + feature_set_content, # for example [[cell1 ... celln] [drug1 ... 
drugn]] + parent_plan_id= + root_name, # name of root plan, subplan names created from this stem + data_pfx= + data_fname_pfx, # DATA file prefix, building block for feature name files + plan_pfx= + plan_fname_pfx, # PLAN file prefix, building block for plan name files ) generator.plan_term(args.plan_dict, root_name, steps) - print("Plan generation complete, total steps: %d" % steps) + print("Plan generation complete, total steps: %d" % steps) if args.json: label = args.generator.get_plan_label(args.plan_dict, root_name) - qualified_name = 'plangen_' + label + '.json' + qualified_name = "plangen_" + label + ".json" json_file_name = os.path.join(args.out_dir, qualified_name) json_abspath = os.path.abspath(json_file_name) write_dict_to_json(args.plan_dict, json_abspath) @@ -1396,22 +1501,24 @@ def main(): if args.print_tree: print("Plan dictionary generated") - pp(args.plan_dict, width=160) # DEBUG comment this out for large plans + pp(args.plan_dict, width=160) # DEBUG comment this out for large plans if args.test: # test1(json_abspath, "test1_sql.db") test2(json_abspath, "test3_sql.db") -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # sqlite3 API functions -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- + def test2(plan_path, db_path): - #run_type = RunType.RESTART + # run_type = RunType.RESTART run_type = RunType.RUN_ALL plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) + plan_id = plan_prep(db_path, plan_name, run_type) plan_dict = load_plan(plan_path) metadata, root_name = get_subplan(plan_dict) @@ -1420,7 +1527,7 @@ def test2(plan_path, db_path): queue.append(root_name) print("Test2 start") - for iloop in it.count(start = 0): + for iloop in it.count(start=0): if len(queue) == 
0: print("Test2 complete - proc loop count: %d" % iloop) break @@ -1438,7 +1545,7 @@ def test2(plan_path, db_path): plan_path, plan_id=plan_id, subplan_id=curr_subplan, - run_type=run_type + run_type=run_type, ) if status < 0: @@ -1446,12 +1553,14 @@ def test2(plan_path, db_path): continue completion_status = dict( - loss=['dont', 'want', 'this', 1.1], - mae=['nope', 2.2], + loss=["dont", "want", "this", 1.1], + mae=["nope", 2.2], r2=[3.3], - val_loss=6.6, val_mae=7.7, val_r2=8.8, + val_loss=6.6, + val_mae=7.7, + val_r2=8.8, lr=0.9, - some_new_thing='abc' + some_new_thing="abc", ) scalar_dict = extract_history(completion_status) @@ -1460,20 +1569,21 @@ def test2(plan_path, db_path): db_path, plan_id=plan_id, subplan_id=curr_subplan, - comp_info_dict=scalar_dict + comp_info_dict=scalar_dict, ) print("Completing subplan %6d" % iloop) -#---------------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------------- # def test1(plan_path, db_path): run_type = RunType.RESTART - #run_type = RunType.RUN_ALL + # run_type = RunType.RUN_ALL plan_name = os.path.basename(plan_path) - plan_id = plan_prep(db_path, plan_name, run_type) + plan_id = plan_prep(db_path, plan_name, run_type) - if (plan_id < 0): + if plan_id < 0: sys.exit("Terminating due to database detected error") print("\nBegin plan navigation and subplan retrieval test") @@ -1484,11 +1594,13 @@ def test1(plan_path, db_path): # the root has no parent / predecessor parent_name = get_predecessor(plan_dict, root_name) - print("Demonstrate that root \'%s\' predecessor is not defined: %s" % (root_name, parent_name)) + print("Demonstrate that root '%s' predecessor is not defined: %s" % + (root_name, parent_name)) # the root contains metadata, it is not a run specification successor_names = get_successors(plan_dict, root_name) - print("\nThe first runable configurations are defined in %s\n" % successor_names) + print("\nThe first 
runable configurations are defined in %s\n" % + successor_names) # the root is the predecessor of these first level runables for sname in successor_names: @@ -1497,35 +1609,38 @@ def test1(plan_path, db_path): # run the right subtree print("\nRun the rightmost subtree \n") - for i in it.count(start = 1): + for i in it.count(start=1): listlen = len(successor_names) if listlen == 0: break for name in successor_names: - status = start_subplan( - db_path, - plan_path, - plan_id=plan_id, - subplan_id=name, - run_type=run_type - ) + status = start_subplan(db_path, + plan_path, + plan_id=plan_id, + subplan_id=name, + run_type=run_type) if status < 0: print("subplan: %s skipped, previously processed" % name) - select_one = successor_names[listlen - 1] + select_one = successor_names[listlen - 1] parent_name = get_predecessor(plan_dict, select_one) - print("%-16s is a successor of %-16s - all successors: %s" % (select_one, parent_name, successor_names)) + print("%-16s is a successor of %-16s - all successors: %s" % + (select_one, parent_name, successor_names)) # test feature lists retrieval API get_subplan_features - value,_ = get_subplan(plan_dict, select_one) + value, _ = get_subplan(plan_dict, select_one) if i < 3: for pf in [False, True]: - _, fs_name_list, train_list, val_list = get_subplan_features(plan_dict, select_one, parent_features=pf) - if False: # very verbose, use only as needed! ??????????????????????????????????????????????????????? - print("\nsubplan original:", select_one, "parent features:", pf) + _, fs_name_list, train_list, val_list = get_subplan_features( + plan_dict, select_one, parent_features=pf) + if ( + False + ): # very verbose, use only as needed! ??????????????????????????????????????????????????????? 
+ print("\nsubplan original:", select_one, "parent features:", + pf) pp(plan_dict[select_one]) print("\nflattened TRAIN") pp(train_list) @@ -1533,25 +1648,33 @@ def test1(plan_path, db_path): pp(val_list) # test runhist retrieval api - row = get_subplan_runhist(db_path, plan_id=plan_id, subplan_id=select_one) - #print(row) + row = get_subplan_runhist(db_path, + plan_id=plan_id, + subplan_id=select_one) + # print(row) # post subplan termination - completion_status = dict(mse=1.1, mae=2.2, r_square=.555, misc='no such column', data=123) + completion_status = dict(mse=1.1, + mae=2.2, + r_square=0.555, + misc="no such column", + data=123) stop_subplan( db_path, plan_id=plan_id, subplan_id=select_one, - comp_info_dict=completion_status + comp_info_dict=completion_status, ) successor_names = get_successors(plan_dict, select_one) print("\nEnd of branch reached") + + # plan_remove(db_path, "plangen_cell8-p2_drug8-p2.json") -#---------------------------------------------------------------------------------- +# ---------------------------------------------------------------------------------- if __name__ == "__main__": main() diff --git a/workflows/cp-leaveout/py/run_chained.py b/workflows/cp-leaveout/py/run_chained.py index bd321c3b..c7f8c2fa 100644 --- a/workflows/cp-leaveout/py/run_chained.py +++ b/workflows/cp-leaveout/py/run_chained.py @@ -1,20 +1,39 @@ -import subprocess -import os +import argparse +import io import json +import os +import subprocess import sys -import io -import argparse import plangen + class Config: - - REQS = ['site', 'plan', 'submit_script', 'upf_directory', 'stages', 'stage_cfg_script', 'job_chain_arg'] - STAGE_CFG_KEYS = ['stage', 'PROCS', 'TURBINE_LAUNCH_ARGS', 'TURBINE_DIRECTIVE_ARGS', - 'WALLTIME', 'IGNORE_ERRORS', 'SH_TIMEOUT', 'BENCHMARK_TIMEOUT', - 'PPN'] - INT_KEYS = ['PROCS', 'PPN', 'BENCHMARK_TIMEOUT', 'SH_TIMEOUT', 'IGNORE_ERRORS'] - + + REQS = [ + "site", + "plan", + "submit_script", + "upf_directory", + "stages", + 
"stage_cfg_script", + "job_chain_arg", + ] + STAGE_CFG_KEYS = [ + "stage", + "PROCS", + "TURBINE_LAUNCH_ARGS", + "TURBINE_DIRECTIVE_ARGS", + "WALLTIME", + "IGNORE_ERRORS", + "SH_TIMEOUT", + "BENCHMARK_TIMEOUT", + "PPN", + ] + INT_KEYS = [ + "PROCS", "PPN", "BENCHMARK_TIMEOUT", "SH_TIMEOUT", "IGNORE_ERRORS" + ] + def __init__(self, cfg): self.cfg = cfg self.stage_cfgs = {} @@ -23,23 +42,28 @@ def validate(self): for r in Config.REQS: if not r in self.cfg: return (False, "Required property '{}' is missing".format(r)) - - self.cfg['stages'] = int(self.cfg['stages']) - if 'stage_cfgs' in self.cfg: - for stage_cfg in self.cfg['stage_cfgs']: - if not 'stage' in stage_cfg: - return (False, "A stage_cfg map is missing required 'stage' property") + self.cfg["stages"] = int(self.cfg["stages"]) + + if "stage_cfgs" in self.cfg: + for stage_cfg in self.cfg["stage_cfgs"]: + if not "stage" in stage_cfg: + return ( + False, + "A stage_cfg map is missing required 'stage' property", + ) for k in stage_cfg: if k not in Config.STAGE_CFG_KEYS: - return (False, "Unknow stage configuration property {}".format(k)) - - stage = int(stage_cfg['stage']) + return ( + False, + "Unknow stage configuration property {}".format(k), + ) + + stage = int(stage_cfg["stage"]) # delete it as its not a proper env var - del stage_cfg['stage'] + del stage_cfg["stage"] self.stage_cfgs[stage] = stage_cfg - return (True,) def get_stage_environment(self, stage): @@ -60,81 +84,105 @@ def update_stage_cfgs(self, runs_per_stage): scfg = self.stage_cfgs[stage] if "PROCS" not in scfg: # + 2: one for swift and one for db rank - scfg['PROCS'] = str(runs + 2) + scfg["PROCS"] = str(runs + 2) if "PPN" not in scfg: - scfg['PPN'] = str(1) - + scfg["PPN"] = str(1) + # update any numeric vals to str values as required for env vars self._vars_to_string(scfg) else: # + 2: one for swift and one for db rank - self.stage_cfgs[stage] = {'PROCS' : str(runs + 2), 'PPN' : str(1)} - + self.stage_cfgs[stage] = {"PROCS": str(runs + 2), 
"PPN": str(1)} + @property def site(self): - return self.cfg['site'] + return self.cfg["site"] @property def plan(self): - return self.cfg['plan'] + return self.cfg["plan"] @property def submit_script(self): - return self.cfg['submit_script'] + return self.cfg["submit_script"] @property def first_stage(self): - return self.cfg['first_stage'] + return self.cfg["first_stage"] @property def first_stage_parent_directory(self): - return self.cfg['first_stage_parent_directory'] + return self.cfg["first_stage_parent_directory"] @first_stage.setter def first_stage(self, value): - self.cfg['first_stage'] = value + self.cfg["first_stage"] = value @first_stage_parent_directory.setter def first_stage_parent_directory(self, value): - self.cfg['first_stage_parent_directory'] = value + self.cfg["first_stage_parent_directory"] = value @property def upf_directory(self): - return self.cfg['upf_directory'] + return self.cfg["upf_directory"] @property def stages(self): - return self.cfg['stages'] - + return self.cfg["stages"] + @stages.setter def stages(self, value): - self.cfg['stages'] = value + self.cfg["stages"] = value @property def stage_cfg_script(self): - return self.cfg['stage_cfg_script'] + return self.cfg["stage_cfg_script"] @property def job_chain_arg(self): - return self.cfg['job_chain_arg'] + return self.cfg["job_chain_arg"] def create_job_chain_directive(self, job_id): - return self.job_chain_arg.replace('', job_id) + return self.job_chain_arg.replace("", job_id) + - def parse_arguments(): parser = argparse.ArgumentParser() # parser.add_argument('--plan', type=str, default='plan.json', # help='plan data file') - parser.add_argument('--stages', type=int, default=-1, - help='number of stages to run (overrides configuration file if non-0)') - parser.add_argument('--config', type=str, default=None, required=True, - help='the configuration file in json format') - parser.add_argument('--dry_run', action='store_true', - help="Runs the workflow with actual job submission, 
displaying each job's configuration") - - parser.add_argument('--first_stage', type=int, default=1, help='the stage to begin the workflow with') - parser.add_argument('--first_stage_parent_directory', type=str, default='', help='the directory containing the parent model runs for the initial stage, if initial_stage > 1') + parser.add_argument( + "--stages", + type=int, + default=-1, + help="number of stages to run (overrides configuration file if non-0)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + required=True, + help="the configuration file in json format", + ) + parser.add_argument( + "--dry_run", + action="store_true", + help= + "Runs the workflow with actual job submission, displaying each job's configuration", + ) + + parser.add_argument( + "--first_stage", + type=int, + default=1, + help="the stage to begin the workflow with", + ) + parser.add_argument( + "--first_stage_parent_directory", + type=str, + default="", + help= + "the directory containing the parent model runs for the initial stage, if initial_stage > 1", + ) # parser.add_argument('--upf_dir', type=str, default=None, required=True, # help='the output directory for the generated upf files') @@ -145,55 +193,67 @@ def parse_arguments(): return parser.parse_args() + def parse_run_vars(outs): - to_prefix = 'TURBINE_OUTPUT=' - job_id_prefix = 'JOB_ID=' + to_prefix = "TURBINE_OUTPUT=" + job_id_prefix = "JOB_ID=" str_io = io.StringIO(outs) - turbine_output = '' - job_id = '' + turbine_output = "" + job_id = "" for line in str_io.readlines(): line = line.strip() if line.startswith(to_prefix): - turbine_output = line[len(to_prefix) : ] + turbine_output = line[len(to_prefix):] elif line.startswith(job_id_prefix): - job_id = line[len(job_id_prefix) : ] - + job_id = line[len(job_id_prefix):] + return (turbine_output, job_id) def run_script(cfg, args, stage): cmd = [cfg.submit_script] + args env = cfg.get_stage_environment(stage) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT, env=env) + p = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env) # stderr is redirected to stdout outs, _ = p.communicate() - return outs.decode('utf-8') + return outs.decode("utf-8") + def run_dry_run(upfs, cfg): for i, upf in enumerate(upfs): # UPFS are in stage order stage = i + cfg.first_stage - args = [cfg.site, '-a', cfg.stage_cfg_script, cfg.plan, upf, str(stage)] + args = [cfg.site, "-a", cfg.stage_cfg_script, cfg.plan, upf, str(stage)] if i > 0: - args += ['', '{}'.format(cfg.job_chain_arg)] + args += ["", "{}".format(cfg.job_chain_arg)] elif cfg.first_stage > 1: - args += [cfg.first_stage_parent_directory, '## JOB 0'] + args += [cfg.first_stage_parent_directory, "## JOB 0"] else: - args += ['job0', '## JOB 0'] + args += ["job0", "## JOB 0"] - print('\n########### DRY RUN JOB {}, Stage {} ##############'.format(stage - cfg.first_stage + 1, stage)) - print("Running: {} {}".format(cfg.submit_script, ' '.join(args))) + print("\n########### DRY RUN JOB {}, Stage {} ##############".format( + stage - cfg.first_stage + 1, stage)) + print("Running: {} {}".format(cfg.submit_script, " ".join(args))) env = cfg.get_stage_environment(stage) - if 'TURBINE_DIRECTIVE_ARGS' in env: - env['TURBINE_DIRECTIVE_ARGS'] = '{}\\n{}'.format(args[7], env['TURBINE_DIRECTIVE_ARGS']) + if "TURBINE_DIRECTIVE_ARGS" in env: + env["TURBINE_DIRECTIVE_ARGS"] = "{}\\n{}".format( + args[7], env["TURBINE_DIRECTIVE_ARGS"]) else: - env['TURBINE_DIRECTIVE_ARGS'] = args[7] - p = subprocess.Popen(['bash', "-c", "source {}".format(cfg.stage_cfg_script)], stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, env=env) + env["TURBINE_DIRECTIVE_ARGS"] = args[7] + p = subprocess.Popen( + ["bash", "-c", "source {}".format(cfg.stage_cfg_script)], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + ) # stderr is redirected to stdout outs, _ = p.communicate() - print(outs.decode('utf-8')) + print(outs.decode("utf-8")) + def 
run_upfs(upfs, cfg): job_id = None @@ -201,27 +261,29 @@ def run_upfs(upfs, cfg): for i, upf in enumerate(upfs): # UPFS are in stage order stage = i + cfg.first_stage - args = [cfg.site, '-a', cfg.stage_cfg_script, cfg.plan, upf, str(stage)] + args = [cfg.site, "-a", cfg.stage_cfg_script, cfg.plan, upf, str(stage)] if job_id: # at least second iteration args += [turbine_output, cfg.create_job_chain_directive(job_id)] elif cfg.first_stage > 1: - args += [cfg.first_stage_parent_directory, '## JOB 0'] + args += [cfg.first_stage_parent_directory, "## JOB 0"] else: - args += ['job0', '## JOB 0'] + args += ["job0", "## JOB 0"] outs = run_script(cfg, args, stage) turbine_output, job_id = parse_run_vars(outs) exp_id = os.path.basename(turbine_output) - print('\n########### JOB {} - Stage {} - {} - {} ##############'.format(stage - cfg.first_stage + 1,stage, exp_id, job_id)) - print("Running: {} {}".format(cfg.submit_script, ' '.join(args))) + print("\n########### JOB {} - Stage {} - {} - {} ##############".format( + stage - cfg.first_stage + 1, stage, exp_id, job_id)) + print("Running: {} {}".format(cfg.submit_script, " ".join(args))) print(outs) - print('TURBINE_OUTPUT: {}'.format(turbine_output)) - print('JOB_ID: {}\n'.format(job_id)) + print("TURBINE_OUTPUT: {}".format(turbine_output)) + print("JOB_ID: {}\n".format(job_id)) if not job_id: print("JOB_ID NOT FOUND - ABORTING RUNS") break + def get_plan_info(plan_file): plan_dict = plangen.load_plan(plan_file) # key of first entry is the root node @@ -230,42 +292,45 @@ def get_plan_info(plan_file): total_stages = -1 total_nodes = -1 for k in iter_pd: - # has skipped the root node, so we can get + # has skipped the root node, so we can get # the second element in val - vals = (k.split(".")) + vals = k.split(".") n_vals = len(vals) total_stages = max(total_stages, n_vals) total_nodes = max(total_nodes, int(vals[1])) - + return (root_node, total_stages, total_nodes) + def generate_upfs(prefix, cfg, root_nodes, n_nodes): 
parents = root_nodes - upf_prefix = '{}/{}_'.format(cfg.upf_directory, prefix) + upf_prefix = "{}/{}_".format(cfg.upf_directory, prefix) upfs = [] counts = [] for s in range(cfg.first_stage, cfg.first_stage + cfg.stages): - upf_path = '{}s{}_upf.txt'.format(upf_prefix, s) + upf_path = "{}s{}_upf.txt".format(upf_prefix, s) parents = generate_stage(parents, n_nodes, upf_path) upfs.append(upf_path) counts.append(len(parents)) return (upfs, counts) + def generate_stage(parents, n_nodes, f_path): children = [] - with open(f_path, 'w') as f_out: + with open(f_path, "w") as f_out: for p in parents: for n in range(1, n_nodes + 1): - child = '{}.{}'.format(p, n) - f_out.write('{}\n'.format(child)) + child = "{}.{}".format(p, n) + f_out.write("{}\n".format(child)) children.append(child) # print('Stage {}: {}'.format(stage, ' '.join(children))) return children - + + def parse_config(args): cfg = None - with open(args.config, 'r') as fin: + with open(args.config, "r") as fin: cfg = Config(json.load(fin)) result = cfg.validate() if not result[0]: @@ -274,40 +339,44 @@ def parse_config(args): if args.stages != 0: cfg.stages = args.stages - + if args.first_stage != 1: cfg.first_stage = args.first_stage - - if args.first_stage_parent_directory != '': + + if args.first_stage_parent_directory != "": cfg.first_stage_parent_directory = args.first_stage_parent_directory return cfg + def compute_parent_nodes(root_node, stage, n_nodes): - """ Computes the the parents nodes of the specified stage """ + """Computes the the parents nodes of the specified stage.""" root_nodes = [root_node] for _ in range(1, stage): children = [] for r in root_nodes: for n in range(1, n_nodes + 1): - child = '{}.{}'.format(r, n) + child = "{}.{}".format(r, n) children.append(child) root_nodes = children - + return root_nodes - + def run(args): cfg = parse_config(args) root_node, total_stages, n_nodes = get_plan_info(cfg.plan) if cfg.first_stage > total_stages: - print("First stage must be less than or equal 
to total number of stages") + print( + "First stage must be less than or equal to total number of stages") sys.exit() - if cfg.first_stage > 1 and ('first_stage_parent_directory' not in cfg.cfg or - cfg.cfg['first_stage_parent_directory'] == ''): - print("Missing required 'first_stage_parent_directory' argument, when first_stage > 1") + if cfg.first_stage > 1 and ("first_stage_parent_directory" not in cfg.cfg or + cfg.cfg["first_stage_parent_directory"] == ""): + print( + "Missing required 'first_stage_parent_directory' argument, when first_stage > 1" + ) sys.exit() if cfg.stages == -1 or cfg.stages >= total_stages: @@ -318,15 +387,22 @@ def run(args): upfs, runs_per_stage = generate_upfs(prefix, cfg, root_nodes, n_nodes) cfg.update_stage_cfgs(runs_per_stage) - print("\nTotal Jobs: {}\nTotal Stages: {}\nNodes: {}".format(cfg.stages, cfg.stages, n_nodes)) - print("Site: {}\nPlan: {}\nSubmit Script: {}\nStage Configuration Script:{}\nUPF directory: {}".format(cfg.site, cfg.plan, - cfg.submit_script, cfg.stage_cfg_script, cfg.upf_directory)) + print("\nTotal Jobs: {}\nTotal Stages: {}\nNodes: {}".format( + cfg.stages, cfg.stages, n_nodes)) + print( + "Site: {}\nPlan: {}\nSubmit Script: {}\nStage Configuration Script:{}\nUPF directory: {}" + .format( + cfg.site, + cfg.plan, + cfg.submit_script, + cfg.stage_cfg_script, + cfg.upf_directory, + )) for i, c in enumerate(runs_per_stage): stage = cfg.first_stage + i scfg = cfg.stage_cfgs[stage] - print("\tStage: {}, UPF: {}, Model Runs: {}, PROCS: {}, PPN: {}".format(stage, - os.path.basename(upfs[i]), c, scfg['PROCS'], scfg['PPN'])) - + print("\tStage: {}, UPF: {}, Model Runs: {}, PROCS: {}, PPN: {}".format( + stage, os.path.basename(upfs[i]), c, scfg["PROCS"], scfg["PPN"])) # TODO Add Dry Run -- for each upf source the cfg-sys as a POpen if args.dry_run: @@ -334,6 +410,7 @@ def run(args): else: run_upfs(upfs, cfg) + if __name__ == "__main__": args = parse_arguments() run(args) diff --git 
a/workflows/cp-leaveout/py/tests/.gitignore b/workflows/cp-leaveout/py/tests/.gitignore index 91273a57..7634d2a9 100644 --- a/workflows/cp-leaveout/py/tests/.gitignore +++ b/workflows/cp-leaveout/py/tests/.gitignore @@ -1 +1 @@ -test_out/ \ No newline at end of file +test_out/ diff --git a/workflows/cp-leaveout/py/tests/test_run_chained.py b/workflows/cp-leaveout/py/tests/test_run_chained.py index 79518cf7..bc343026 100644 --- a/workflows/cp-leaveout/py/tests/test_run_chained.py +++ b/workflows/cp-leaveout/py/tests/test_run_chained.py @@ -1,27 +1,29 @@ # Run with python -m unittest tests.test_run_chained from parent directory - -import unittest import os +import unittest import run_chained + class RunChainedTests(unittest.TestCase): def test_root_nodes(self): - root_node = '1' + root_node = "1" first_stage = 1 n_nodes = 4 - root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, n_nodes) - self.assertEqual(['1'], root_nodes) + root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, + n_nodes) + self.assertEqual(["1"], root_nodes) first_stage = 3 n_nodes = 4 - root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, n_nodes) + root_nodes = run_chained.compute_parent_nodes(root_node, first_stage, + n_nodes) self.assertEqual(16, len(root_nodes)) for a in range(1, 5): for b in range(1, 5): - self.assertTrue('1.{}.{}'.format(a, b) in root_nodes) + self.assertTrue("1.{}.{}".format(a, b) in root_nodes) def read_lines(self, fname): with open(fname) as f_in: @@ -30,37 +32,45 @@ def read_lines(self, fname): def test_upfs(self): - if os.path.exists('./tests/test_out/test_upf_s1_upf.txt'): - os.remove('./tests/test_out/test_upf_s1_upf.txt') + if os.path.exists("./tests/test_out/test_upf_s1_upf.txt"): + os.remove("./tests/test_out/test_upf_s1_upf.txt") - args = {'upf_directory' : './tests/test_out', 'first_stage' : 1, 'stages' : 1} + args = { + "upf_directory": "./tests/test_out", + "first_stage": 1, + "stages": 1 + } cfg = 
run_chained.Config(args) root_nodes = run_chained.compute_parent_nodes(1, 1, 4) - run_chained.generate_upfs('test_upf', cfg, root_nodes, 4) - vals = self.read_lines('./tests/test_out/test_upf_s1_upf.txt') - self.assertEqual(['1.1', '1.2', '1.3', '1.4'], vals) + run_chained.generate_upfs("test_upf", cfg, root_nodes, 4) + vals = self.read_lines("./tests/test_out/test_upf_s1_upf.txt") + self.assertEqual(["1.1", "1.2", "1.3", "1.4"], vals) - if os.path.exists('./tests/test_out/test_upf_s2_upf.txt'): - os.remove('./tests/test_out/test_upf_s2_upf.txt') - os.remove('./tests/test_out/test_upf_s3_upf.txt') + if os.path.exists("./tests/test_out/test_upf_s2_upf.txt"): + os.remove("./tests/test_out/test_upf_s2_upf.txt") + os.remove("./tests/test_out/test_upf_s3_upf.txt") - args = {'upf_directory' : './tests/test_out', 'first_stage' : 2, 'stages' : 2} + args = { + "upf_directory": "./tests/test_out", + "first_stage": 2, + "stages": 2 + } cfg = run_chained.Config(args) root_nodes = run_chained.compute_parent_nodes(1, 2, 4) - upfs, runs_per_stage = run_chained.generate_upfs('test_upf', cfg, root_nodes, 4) + upfs, runs_per_stage = run_chained.generate_upfs( + "test_upf", cfg, root_nodes, 4) vals = self.read_lines(upfs[0]) self.assertEqual(16, len(vals)) self.assertEqual(16, runs_per_stage[0]) for a in range(1, 5): for b in range(1, 5): - self.assertTrue('1.{}.{}'.format(a, b) in vals) - + self.assertTrue("1.{}.{}".format(a, b) in vals) + vals = self.read_lines(upfs[1]) self.assertEqual(64, len(vals)) self.assertEqual(64, runs_per_stage[1]) for a in range(1, 5): for b in range(1, 5): for c in range(1, 5): - self.assertTrue('1.{}.{}.{}'.format(a, b, c) in vals) - \ No newline at end of file + self.assertTrue("1.{}.{}.{}".format(a, b, c) in vals) diff --git a/workflows/cp-leaveout/scripts/Node.py b/workflows/cp-leaveout/scripts/Node.py index a6aa0e4c..0daabc0d 100644 --- a/workflows/cp-leaveout/scripts/Node.py +++ b/workflows/cp-leaveout/scripts/Node.py @@ -1,77 +1,138 @@ - # NODE PY 
# The training node information as stored in the logs # See the footer of this file for example log text that is parsed here +# This class must remain simple enough to pickle +# thus it cannot contain its own logger (Python 3.6 issue 30520) + +# import math -import math class Node: # TensorFlow is done when you see this training_done = "[==============================]" - def __init__(self, id=None): + def __init__(self, id=None, logger=None): # The ID is e.g.: "1.2.3" self.id = id # Use string length of id to deduce stage: self.stage = None # Number of training steps performed self.steps = 0 + # Various error metrics: + self.loss = None self.val_loss = None - # Difference wrt parent (lower is better) + self.mse = None + self.mae = None + self.r2 = None + self.corr = None + # The first learning rate: + self.lr_first = None + # The final learning rate: + self.lr_final = None + # Differences wrt parent (lower is better) + self.loss_delta = None self.val_loss_delta = None + # Validation set size + self.val_data = None # Epochs prescribed by the workflow self.epochs_planned = None # Epochs actually run (consider early stopping) - self.epochs_actual = 0 + self.epochs_actual = 0 + # Epochs cumulative: include parents' epochs (CP weight-sharing) + self.epochs_cumul = None self.date_start = None - self.date_stop = None + self.date_stop = None + # Time to build dataframe + self.build_df = None + # Time to load initial weights + self.load_initial = None # Training time in seconds self.time = 0 + # Training run restarts- each log file makes a new segment + self.segment = 0 + # Time for given segment from "Current time" + self.segments = {} + # Bandwidths for checkpoint write by segment + self.ckpt_writes = {} # Did EarlyStopping stop this node? self.stopped_early = False + # Did the topN module find data for this Node? + self.has_data = True # Did training complete for this node? 
self.complete = False - self.verbose = False - self.debug("START: " + str(self)) + # Can disable logging here: + self.verbose = True + # self.debug(logger, "START: " + str(self)) - def set_id(self, id): + def set_id(self, id, logger=None): self.id = id - self.stage = (len(self.id) - 1 ) // 2 + self.stage = (len(self.id) - 1) // 2 + # self.debug(logger, "SET ID: " + id) + + def new_segment(self): + self.segment += 1 def parent(self): if self.stage == 1: return None return self.id[0:-2] + def __repr__(self): + return self.__str__() + def __str__(self): special = "" - if not self.complete: - special = " INCOMPLETE!" - if self.stopped_early: - special = " EARLY STOP!" - return "Node [%s]: %s (epochs=%i/%s, val_loss=%s)%s" % \ - (Node.maybe_str_integer(self.stage), - self.id, - self.epochs_actual, - Node.maybe_str_integer(self.epochs_planned), - Node.maybe_str_float(self.val_loss, "%0.6f"), - special) + if not self.has_data: + special = "NO DATA!" + else: + if not self.complete: + special = " INCOMPLETE!" + if self.stopped_early: + special = " EARLY STOP!" + return "Node [%s]: %s (epochs=%i/%s, loss=%s, val_loss=%s)%s" % ( + Node.maybe_str_integer(self.stage), + self.id, + self.epochs_actual, + Node.maybe_str_integer(self.epochs_planned), + Node.maybe_str_float(self.loss, "%0.6f"), + Node.maybe_str_float(self.val_loss, "%0.6f"), + special, + ) def str_table(self): - ''' Like str() but uses fixed-width fields ''' + """Like str() but uses fixed-width fields.""" special = "" + if not self.has_data: + return "%-13s : %i NO-DATA" % (self.id, self.stage) if not self.complete: special = " INCOMPLETE!" if self.stopped_early: special = " EARLY STOP!" 
- return "%-12s : %i : %2i / %2i : %0.5f : %s - %s : %s" % \ - (self.id, self.stage, - self.epochs_actual, self.epochs_planned, - self.val_loss, - self.date_start, self.date_stop, - special) + return "%-13s : %i : %2s / %2s : %s - %s : %s : %s" % ( + self.id, + self.stage, + Node.maybe_str_integer(self.epochs_actual), + Node.maybe_str_integer(self.epochs_planned), + str(self.date_start), + str(self.date_stop), + self.str_errors(), + special, + ) + + def str_errors(self): + """Return errors as big string.""" + fmt = "%0.6f" + s = ("loss: %s vl: %s mse: %s mae: %s r2: %s corr: %s") % ( + Node.maybe_str_float(self.loss, fmt), + Node.maybe_str_float(self.val_loss, fmt), + Node.maybe_str_float(self.mse, fmt), + Node.maybe_str_float(self.mae, fmt), + Node.maybe_str_float(self.r2, fmt), + Node.maybe_str_float(self.corr, fmt), + ) + return s def maybe_str_integer(i): if i is None: @@ -83,77 +144,225 @@ def maybe_str_float(f, spec): return "?" return spec % f - def parse_epochs(self, line): + def bad_line(self, line): + print("") + print("BAD LINE: " + line) + print("") + + def parse_epochs(self, line, logger=None): tokens = line.split() self.epochs_planned = int(tokens[-1].strip()) - self.debug("epochs_planned: %i" % self.epochs_planned) + self.trace(logger, "epochs_planned: %i" % self.epochs_planned) - def stop_early(self): + def parse_load_initial(self, line, logger=None): + tokens = line.split() + self.load_initial = float(tokens[4]) + # print("load_initial: " + str(self.load_initial)) + + def parse_epoch_status(self, line, logger=None): + tokens = line.split() + assert len(tokens) == 2, "bad line: " + line + ints = tokens[1].split("/") + assert len(tokens) == 2 + self.epochs_actual = int(ints[0]) + self.trace(logger, "epochs_actual: " + str(self.epochs_actual)) + + def parse_current_time(self, line, logger=None): + tokens = line.split() + assert len(tokens) == 3, "bad line: " + line + # Chop off leading dots: ....123.123 + t = tokens[2][4:] + 
self.segments[self.segment] = float(t) + # print("%-13s %i %r" % (self.id, self.segment, self.segments)) + + def parse_model_write(self, line, logger=None): + tokens = line.split() + t = float(tokens[7][1:]) + self.ckpt_writes[self.segment] = t + self.trace(logger, "model_write: %0.3f" % t) + + def stop_early(self, logger=None): self.stopped_early = True - self.debug("STOP EARLY") + self.trace(logger, "STOP EARLY") def parse_date_start(self, line): tokens = line.split() self.date_start = tokens[0] + " " + tokens[1] - def parse_date_stop(self, line): + def parse_date_stop(self, line, logger=None): tokens = line.split() self.date_stop = tokens[0] + " " + tokens[1] - if self.epochs_actual == self.epochs_planned or \ - self.stopped_early: + if self.epochs_planned is None: + self.trace(logger, "STOP : epochs_planned=None") + return + if self.epochs_actual == self.epochs_planned or self.stopped_early: self.complete = True - self.debug("COMPLETE") + self.trace(logger, "COMPLETE") - def parse_training_done(self, line): - self.epochs_actual += 1 - # Find the location of training_done (td) (to accommodate prefixes) - tokens = line.split() - td = 0 - while tokens[td] != Node.training_done: - td = td + 1 - stepii = tokens[td-1].split("/") - self.steps += int(stepii[0]) - time_s = tokens[td+2] # e.g., "321s" - self.time += int(time_s[0:-1]) - # Always collect val_loss: early stopping could happen: - self.val_loss = float(tokens[td+15]) + def parse_training_done(self, line, logger=None): + # The current epoch should already be set + # by parse_epoch_status() + # First, find the location of training_done (td) + # (to accommodate prefixes) + try: + tokens = line.split() + td = 0 + while tokens[td] != Node.training_done: + td = td + 1 + stepii = tokens[td - 1].split("/") + self.steps += int(stepii[0]) + time_s = tokens[td + 2] # e.g., "321s" + self.time += int(time_s[0:-1]) + # Always collect losses: early stopping could happen: + self.loss = float(tokens[td + 5]) + self.val_loss = 
float(tokens[td + 14]) + except Exception as e: + self.bad_line(line) + raise (e) + + def parse_val_data(self, fp): + """fp is the file pointer to save/python.log If val data is not found, + node.val_data will remain None.""" + marker = "val data = " + marker_length = len(marker) + while True: + line = fp.readline() + if line == "": + break + index = line.find("val data =") + if index == -1: + continue + tail = line[index + marker_length:] + comma = tail.find(",") + value_string = tail[:comma] + self.val_data = int(value_string) + + def parse_python_log(self, fp): + """fp is the file pointer to save/python.log If lines are not found, + node.mse, etc., will remain None.""" + marker = "Comparing y_true " + # The marker is just after the date: + # We search this way for speed. + date_len = len("YYYY-MM-DD HH:MM:SS ") # trailing space + while True: + line = fp.readline() + if line == "": + break + if line.startswith("Epoch ", date_len) and \ + "lr=" in line: + tokens = line.split("=") + lr = float(tokens[1]) + # print("%s lr=%0.6f" % (self.id, lr)) + if self.lr_first is None: + self.lr_first = lr + else: + self.lr_final = lr + if line.startswith(marker, date_len): + line = fp.readline() + tokens = check_token(line, 2, "mse:") + self.mse = float(tokens[3]) + # print("mse: " + str(self.mse)) + line = fp.readline() + tokens = check_token(line, 2, "mae:") + self.mae = float(tokens[3]) + line = fp.readline() + tokens = check_token(line, 2, "r2:") + self.r2 = float(tokens[3]) + line = fp.readline() + tokens = check_token(line, 2, "corr:") + self.corr = float(tokens[3]) + # Loop! 
We want the last such values in the file + + def get_loss_delta(node): + if node.loss_delta is None: + raise ValueError("No loss_delta!") + return node.loss_delta def get_val_loss_delta(node): - ''' For sorting ''' - if node.val_loss_delta == None: + if node.val_loss_delta is None: raise ValueError("No val_loss_delta!") return node.val_loss_delta - def debug(self, message): - if not self.verbose: + def debug(self, logger, message): + # assert(logger != None) # Use this to find missing loggers + if logger is None or not self.verbose: return - print("NODE: " + message) + logger.debug("NODE: [%s] %s" % (self.id, message)) - def total_time(self, nodes): + def trace(self, logger, message): + # assert(logger != None) # Use this to find missing loggers + if logger is None or not self.verbose: + return + import logging + + logger.log(level=logging.DEBUG - 5, + msg=("NODE: [%s] %s" % (self.id, message))) + + def get_time_cumul(self, nodes): + """Time cumulative including parents' time.""" parent = self.parent() - if parent == None: + if parent is None: return self.time - return self.time + nodes[parent].total_time(nodes) + return self.time + nodes[parent].get_time_cumul(nodes) + + def get_segments(self): + total = 0 + for s, t in self.segments.items(): + total += t + return total + + def get_epochs_cumul(self, nodes): + """Epochs cumulative including parents' epochs.""" + if self.epochs_cumul is not None: + return self.epochs_cumul + # Initialize: + self.epochs_cumul = self.epochs_actual + parent = self.parent() + if parent is not None and parent in nodes: + # Add parents: + self.epochs_cumul += nodes[parent].get_epochs_cumul(nodes) + return self.epochs_cumul + + +def check_token(line, index, token): + """Assert that token is in line at given index.""" + tokens = line.split() + if tokens[index] != token: + raise Exception( + ("could not find token: '%s'\n" + "in line: '%s'") % (token, line)) + return tokens + + +def check(condition, message): + """Check condition or raise 
Exception with given message.""" + if not condition: + raise Exception(message) + + +# EXAMPLES: + +# __init__() + +# 2019-12-14 09:46:32 MODEL RUNNER DEBUG node = 1.4.2.1 -''' -EXAMPLES: +# parse_epochs() ==> self.epochs_planned -__init__() +# 2019-12-14 09:46:32 MODEL RUNNER DEBUG epochs = 5 -2019-12-14 09:46:32 MODEL RUNNER DEBUG node = 1.4.2.1 +# parse_epoch_status() (from Keras) -parse_epochs() ==> self.epochs_planned +# Epoch 29/50 -2019-12-14 09:46:32 MODEL RUNNER DEBUG epochs = 5 +# parse_val_data() ==> self.val_data -stop_early() +# 2020-04-15 13:45:41 CV fold 0: train data = 5265, val data = 1400, test data = 0 -Epoch 00004: early stopping +# stop_early() -training_done() +# Epoch 00004: early stopping -16092/16092 [==============================] - 315s 20ms/step - loss: 0.0065 - mae: 0.0565 - r2: -0.6208 - val_loss: 0.0139 - val_mae: 0.0575 - val_r2: -0.3959 +# training_done() -==> self.epochs_actual, self.val_loss, self.time, self.complete +# 16092/16092 [==============================] - 315s 20ms/step - loss: 0.0065 - mae: 0.0565 - r2: -0.6208 - val_loss: 0.0139 - val_mae: 0.0575 - val_r2: -0.3959 -''' +# ==> self.epochs_actual, self.val_loss, self.time, self.complete diff --git a/workflows/cp-leaveout/scripts/README.adoc b/workflows/cp-leaveout/scripts/README.adoc index 94580170..a9fb34ab 100644 --- a/workflows/cp-leaveout/scripts/README.adoc +++ b/workflows/cp-leaveout/scripts/README.adoc @@ -33,20 +33,22 @@ COMPLETE / TOTAL = 1364 / 1364 : 0 remaining. ==== Generate a Node Pickle (extract-node-info) -This is a Python Pickle containing the Node data. See Node.py . +This makes the Python Pickle containing the Node data. See Node.py . This avoids needing to walk all logs all the time (which takes tens of seconds). ---- -$ scripts/extract-node-info.sh $D +$ scripts/extract-node-info.sh $D [nodes...] ---- +The data structure in the Pickle is a simple dictionary mapping node ID strings e.g. "1.2.3.4" to object of type Node. 
+ ==== Print Node info (print-node-info) -Prints a big table of all Node statistics: +Prints a big table of all Node statistics using the Node Pickle. Format: -NODE STAGE EPOCHS-ACTUAL / EPOCHS-MAX VAL-LOSS TIME-START TIME_STOP EARLY-STOP? +NODE STAGE EPOCHS-ACTUAL / EPOCHS-MAX LOSS VAL-LOSS TIME-START TIME_STOP EARLY-STOP? ---- $ scripts/print-node-info.sh $D @@ -58,6 +60,8 @@ $ scripts/print-node-info.sh $D ... ---- +If specific node IDs are given on the command line, only those records are printed. + ==== Find loss increases (find-loss-increases) Brettin email 2019-12-18: @@ -83,10 +87,32 @@ Analytically determine the number of Nodes in the workflow given N and S. Compile workflow statistics +==== Report learning rates + +Dump start and end learning rates into `lrs.txt` + +---- +$ scripts/report-lrs.sh $D > $D/lrs.txt +---- + +=== Data management + +==== mk-log-tar.sh + +Make a tarball of just the important logs (not the big HDF files). + +==== cp-subtree.sh + +Make a copy of experiment run subtrees; includes a random sample of +leaf nodes and all their parents. + === Analysis for model.log files -These are not really supported for Summit runs because we are using in-memory Python, -but they could be easily fixed. Also, they run against the model.logs and not the Pickle, so they are slow. +These are not really supported for Summit runs +because we are using in-memory Python, +but they could be easily fixed. +Also, they run against the model.logs and not the Pickle, +so they are slow. ==== extract-stats.sh @@ -122,6 +148,14 @@ Average GPU utilization List the Nodes from the JSON file. +==== List Node Singles + +List the Nodes from the JSON file with a single cell line. + +==== Leaf Stats + +Report key stats from the python.log for the given nodes. 
+ ==== tar experiment Make backup tars for experiment data diff --git a/workflows/cp-leaveout/scripts/avg-stage.py b/workflows/cp-leaveout/scripts/avg-stage.py new file mode 100644 index 00000000..abeb70da --- /dev/null +++ b/workflows/cp-leaveout/scripts/avg-stage.py @@ -0,0 +1,61 @@ +# AVG STAGE PY + +import argparse +import os +import pickle +import statistics + +from utils import fail + +STAGE_ANY = 0 + +parser = argparse.ArgumentParser(description="Finds loss increases.") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--filename", + "-f", + default="node-info", + help="Change the node pkl file name") +args = parser.parse_args() + +node_pkl = args.directory + "/" + args.filename + ".pkl" + +try: + with open(node_pkl, "rb") as fp: + # This is a dict ("node_id" -> Node) + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +print("total nodes: %i" % len(data)) + +# Total Node count: +total = 0 +# stages = { 1:[], 2:[], 3:[], 4:[], 5:[] } +# epochs = { 1:[], 2:[], 3:[], 4:[], 5:[] } +times = {1: [], 2: [], 3: [], 4: [], 5: []} +vlosses = {1: [], 2: [], 3: [], 4: [], 5: []} + +for node_id in data.keys(): + node = data[node_id] + if not node.complete: + continue + # stages[node.stage].append(node.time) + # epochs[node.stage].append(node.epochs_actual) + times[node.stage].append(node.get_segments() / node.epochs_actual) + vlosses[node.stage].append(node.val_loss) + if node.stage == 3: + print("%s %0.2f %i" % + (node.id, node.get_segments(), node.epochs_actual)) + +with open(args.directory + "/times.data", "w") as fp: + for stage in times.keys(): + count = len(times[stage]) + # print("stage: %i (%i) %r" % (stage, count, times[stage])) + timer = statistics.mean(times[stage]) + fp.write("%i %0.2f # count=%i\n" % (stage, timer, count)) + +with open(args.directory + "/vloss.data", "w") as fp: + for stage in times.keys(): + count = len(times[stage]) + vloss = 
statistics.mean(vlosses[stage]) + fp.write("%i %0.6f # count=%i\n" % (stage, vloss, count)) diff --git a/workflows/cp-leaveout/scripts/avg-stage.sh b/workflows/cp-leaveout/scripts/avg-stage.sh new file mode 100755 index 00000000..913d0353 --- /dev/null +++ b/workflows/cp-leaveout/scripts/avg-stage.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +# AVG STAGE SH + +# Input: Provide an experiment directory +# Output: Per-stage averages printed to plottable files + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/avg-stage.py ${*} diff --git a/workflows/cp-leaveout/scripts/avg-utils.py b/workflows/cp-leaveout/scripts/avg-utils.py index b655944a..a0739b65 100644 --- a/workflows/cp-leaveout/scripts/avg-utils.py +++ b/workflows/cp-leaveout/scripts/avg-utils.py @@ -1,4 +1,3 @@ - import sys import numpy diff --git a/workflows/cp-leaveout/scripts/baseline-error-list.sh b/workflows/cp-leaveout/scripts/baseline-error-list.sh new file mode 100755 index 00000000..eb5392bb --- /dev/null +++ b/workflows/cp-leaveout/scripts/baseline-error-list.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# BASELINE ERROR LIST SH +# WIP: Script to extract python.logs from a given DIR and STAGE + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + -H "and OUTPUT filename" \ + DIR STAGE OUTPUT - ${*} + +if [[ ! 
-d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +for F in experiments/X385/run/?.?.?.?.?.?/save/python.log +do + echo $( basename $( dirname $( dirname $F ) ) ) +done > $OUTPUT diff --git a/workflows/cp-leaveout/scripts/check-db-pkl.py b/workflows/cp-leaveout/scripts/check-db-pkl.py index ef1ddfaf..cac64bbb 100644 --- a/workflows/cp-leaveout/scripts/check-db-pkl.py +++ b/workflows/cp-leaveout/scripts/check-db-pkl.py @@ -1,26 +1,26 @@ - # CHECK DB PKL PY # WIP -import argparse, os, pickle, sys - +import argparse +import os +import pickle import sqlite3 +import sys from sqlite3 import Error as db_Error from Node import Node from utils import abort -parser = argparse.ArgumentParser(description='Parse all log files') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" -db_file = args.directory + "/cplo.db" +db_file = args.directory + "/cplo.db" -try: - with open(node_pkl, 'rb') as fp: +try: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: abort(e, os.EX_IOERR, "Could not load pickle: " + node_pkl) @@ -37,7 +37,6 @@ if d == None: break print(str(d[0])) - cursor.close() conn.close() diff --git a/workflows/cp-leaveout/scripts/check-run.sh b/workflows/cp-leaveout/scripts/check-run.sh index 406daf08..62200d3b 100755 --- a/workflows/cp-leaveout/scripts/check-run.sh +++ b/workflows/cp-leaveout/scripts/check-run.sh @@ -13,7 +13,7 @@ source $SUPERVISOR/workflows/common/sh/utils.sh SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ DIR - ${*} -if ! [ -d $DIR ] +if ! 
[[ -d $DIR ]] then echo "Does not exist: $DIR" exit 1 @@ -27,17 +27,31 @@ SUCCESS=0 if grep -q "User defined signal 2" $DIR/output.txt then + # Summit time out + echo "Job timed out normally." + SUCCESS=1 + +elif grep -q "DUE TO TIME LIMIT" $DIR/output.txt +then + # Frontier time out echo "Job timed out normally." SUCCESS=1 -fi -if grep -q "TURBINE: EXIT CODE: 0" $DIR/output.txt +elif grep -q "EXIT CODE: 0" $DIR/output.txt then echo "Job completed normally." - grep "TURBINE: MPIEXEC TIME: " $DIR/output.txt + grep "MPIEXEC TIME: " $DIR/output.txt SUCCESS=1 fi +if (( ! SUCCESS )) +then + # Find MPI Aborts on Frontier + grep "START:" $DIR/output.txt + grep "MPICH .* Abort" $DIR/output.txt | \ + cut --delimiter ' ' --fields=1-12 +fi + if (( ! SUCCESS )) then echo "Job failed!" diff --git a/workflows/cp-leaveout/scripts/clean-ckpts-run.sh b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh new file mode 100755 index 00000000..67ef5240 --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-ckpts-run.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -eu + +# CLEAN CKPTS RUN SH + +# See ./clean-ckpts.sh + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an run DIR (e.g., .../experiments/X042/run/1.2.3)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given run directory does not exist: $DIR" + exit 1 +fi + +echo "RUN: $DIR" + +if ! [[ -d $DIR/save/ckpts/epochs ]] +then + echo "No epochs directory." + exit +fi + +cd $DIR/save/ckpts/epochs +MODELS=( $( ls ) ) + +N=${#MODELS[@]} +echo "MODELS: $N" + +# Do not clean the last 3 models +for (( i=0 ; i<$N-3 ; i++ )) +do + MODEL=${MODELS[$i]} + # Use 10# to force MODEL as base-10 + # (Bash treats e.g. MODEL=010 as octal) + if (( 10#$MODEL % 5 == 0 )) + then + continue + fi + if ! 
[[ -f $MODEL/model.h5 ]] + then + continue + fi + rm -v $MODEL/model.h5 +done diff --git a/workflows/cp-leaveout/scripts/clean-ckpts.sh b/workflows/cp-leaveout/scripts/clean-ckpts.sh new file mode 100755 index 00000000..95b787dd --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-ckpts.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +# CLEAN CKPTS SH + +# Clean up old checkpoints + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +RUNS=( $( echo $DIR/run/* ) ) + +for RUN in ${RUNS[@]} +do + $THIS/clean-ckpts-run.sh $RUN + echo +done diff --git a/workflows/cp-leaveout/scripts/clean-top21.py b/workflows/cp-leaveout/scripts/clean-top21.py new file mode 100644 index 00000000..d16126bd --- /dev/null +++ b/workflows/cp-leaveout/scripts/clean-top21.py @@ -0,0 +1,91 @@ +# CLEAN TOP21 +# Cleans the top21 file so only LINCS records are present +# File names are hard-coded but easy to change + +import logging + +logger = logging.getLogger("clean-top21") +logger.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter("%(asctime)s %(message)s", datefmt="%H:%M:%S") +ch.setFormatter(formatter) +logger.addHandler(ch) +logger.info("Start") + +import pandas as pd + +logger.info("Pandas") + +SCRATCH = "/gpfs/alpine/med106/scratch/wozniak" +CANDLE_DATA = SCRATCH + "/CANDLE-Data/ChallengeProblem" + +# The original data from Yoo: +original = CANDLE_DATA + "/top21_2020Jul/top21.h5" +lincs1000 = CANDLE_DATA + "/top21_2020Jul/lincs1000" + +# The file we are creating here: +output = CANDLE_DATA + "/top21_2020Jul/top21-cleaned-dd.h5" + +# List of names in LINCS: +lincs = [] +with open(lincs1000, "r") as fp: + while True: + line = fp.readline() + if 
len(line) == 0: + break + lincs.append(line.strip()) + +logger.info("lincs length: %i" % len(lincs)) + +store_in = pd.HDFStore(original, "r") +df = store_in.get("df") + +logger.info("HDF Opened.") + +columns = df.columns.to_list() +logger.info("df columns original: %i" % len(columns)) + +# List of dataframe column names to delete: +delete_these = [] + +count_key = 0 +count_GE_N = 0 +count_GE_Y = 0 +count_DD = 0 +count_other = 0 +for column in columns: + if column.startswith("GE_"): + # print("GE " + column) + substring = column[3:] + if substring in lincs: + count_GE_Y += 1 + else: + count_GE_N += 1 + delete_these.append(column) + elif column.startswith("DD_"): + # print("DD " + column) + count_DD += 1 + # delete_these.append(column) + elif column == "AUC" or column == "DRUG" or column == "CELL": + count_key += 1 + else: + print("NO '%s'" % column) + count_other += 1 + +print("count_key: %i" % count_key) +print("count_GE_Y: %i" % count_GE_Y) +print("count_GE_N: %i" % count_GE_N) +print("count_DD: %i" % count_DD) +print("count_other: %i" % count_other) + +logger.info("Scanned.") +logger.info("delete_these: %i" % len(delete_these)) +df.drop(columns=delete_these, inplace=True) +logger.info("df columns after: %i" % len(df.columns.to_list())) + +logger.info("Dropped.") + +df.to_hdf(output, key="df", mode="w") + +logger.info("Wrote.") diff --git a/workflows/cp-leaveout/scripts/compare-errors.py b/workflows/cp-leaveout/scripts/compare-errors.py new file mode 100644 index 00000000..3d612c9e --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-errors.py @@ -0,0 +1,67 @@ +# COMPARE ERRORS PY + +# Input: Provide two experiment DIRECTORIES and OUTPUT file +# Output: NODE_ID EPOCHS1 ERROR1 EPOCHS2 ERROR2 +# where an ERROR is MSE MAE R2 CORR + +# Could easily be updated to pull out only one error stat +# (see commented code) + +import argparse +import pickle + +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory1", help="The 
1st experiment directory (EXPID)") +parser.add_argument("directory2", help="The 2nd experiment directory (EXPID)") +# parser.add_argument("error", +# help="The error type to compare") +parser.add_argument("output", help="The output file") + +args = parser.parse_args() + +# logging.basicConfig(level=logging.DEBUG, format="%(message)s") +# logger = logging.getLogger("compare_errors") + +node_pkl_1 = args.directory1 + "/node-info.pkl" +node_pkl_2 = args.directory2 + "/node-info.pkl" + +# known_errors = ["mse", "mae", "r2", "corr"] +# if args.error not in known_errors: +# print("given error '%s' not in known errors: %s" % +# (args.error, known_errors)) +# exit(1) + +with open(node_pkl_1, "rb") as fp: + nodes_1 = pickle.load(fp) +with open(node_pkl_2, "rb") as fp: + nodes_2 = pickle.load(fp) +# print("%i %i" % (len(nodes_1), len(nodes_2))) + + +def get_errors(node): + return "%f %f %f %f" % (node.mse, node.mae, node.r2, node.corr) + + +# for node_id in nodes_1: +# print(node_id) +# exit(1) + +missing = 0 +count = 0 +with open(args.output, "w") as fp: + for node_id in nodes_2: + if node_id not in nodes_1: + print("missing: " + node_id) + missing += 1 + continue + count += 1 + epochs_1 = nodes_1[node_id].get_epochs_cumul(nodes_1) + errors_1 = get_errors(nodes_1[node_id]) + epochs_2 = nodes_2[node_id].get_epochs_cumul(nodes_2) + errors_2 = get_errors(nodes_2[node_id]) + fp.write("%2i %s %3i %s %3i %s\n" % + (count, node_id, epochs_1, errors_1, epochs_2, errors_2)) + +print("compared: %2i" % count) +print("missing: %2i" % missing) +print("wrote: %s" % args.output) diff --git a/workflows/cp-leaveout/scripts/compare-errors.sh b/workflows/cp-leaveout/scripts/compare-errors.sh new file mode 100755 index 00000000..985ba3e1 --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-errors.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +# COMPARE ERRORS SH +# Compare errors from $DIR1/node-info.pkl and $DIR2/node-info.pkl +# See compare-errors.py + +THIS=$( readlink --canonicalize $( 
dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/compare-errors.py $* diff --git a/workflows/cp-leaveout/scripts/compare-losses.py b/workflows/cp-leaveout/scripts/compare-losses.py new file mode 100644 index 00000000..78294b0a --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-losses.py @@ -0,0 +1,32 @@ +# COMPARE LOSSES PY + +# Input: Provide two experiment directories +# Output: Stream of NODE_ID LOSS1 LOSS2 + +import argparse +import pickle + +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory1", help="The 1st experiment directory (EXPID)") +parser.add_argument("directory2", help="The 2nd experiment directory (EXPID)") + +args = parser.parse_args() + +# logging.basicConfig(level=logging.DEBUG, format="%(message)s") +# logger = logging.getLogger("extract_node_info") + +node_pkl_1 = args.directory1 + "/node-info.pkl" +node_pkl_2 = args.directory2 + "/node-info.pkl" + +with open(node_pkl_1, "rb") as fp: + nodes_1 = pickle.load(fp) +with open(node_pkl_2, "rb") as fp: + nodes_2 = pickle.load(fp) +# print("%i %i" % (len(nodes_1), len(nodes_2))) + +count = 1 +for node_id in nodes_2: + loss_1 = nodes_1[node_id].val_loss + loss_2 = nodes_2[node_id].val_loss + print("%2i %s %8.7f %8.7f" % (count, node_id, loss_1, loss_2)) + count += 1 diff --git a/workflows/cp-leaveout/scripts/compare-losses.sh b/workflows/cp-leaveout/scripts/compare-losses.sh new file mode 100755 index 00000000..7636c09b --- /dev/null +++ b/workflows/cp-leaveout/scripts/compare-losses.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -eu + +# COMPARE LOSSES SH +# Compare losses from $DIR1/node-info.pkl and $DIR2/node-info.pkl + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. 
) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide 2 experiment DIRs (e.g., .../experiments/X042)!" \ + DIR1 DIR2 - ${*} + +for DIR in $DIR1 $DIR2 +do + if [[ ! -d $DIR ]] + then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 + fi +done + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/compare-losses.py $DIR1 $DIR2 > compared-losses.txt +awk '{print $3, $4}' < compared-losses.txt > compared-losses.data +sort -n compared-losses.data | nl --number-width=2 \ + > compared-losses-sorted.data diff --git a/workflows/cp-leaveout/scripts/compute-node-count.py b/workflows/cp-leaveout/scripts/compute-node-count.py index 2a7ff86d..1379f4dc 100644 --- a/workflows/cp-leaveout/scripts/compute-node-count.py +++ b/workflows/cp-leaveout/scripts/compute-node-count.py @@ -1,4 +1,3 @@ - # COMPUTE NODE COUNT PY # Simply calculate the node count @@ -7,7 +6,7 @@ S = 5 total = 0 -current = 1 # Number of nodes in current stage +current = 1 # Number of nodes in current stage for stage in range(0, S): current *= 4 print("%i: current: %4i" % (stage, current)) diff --git a/workflows/cp-leaveout/scripts/count-lines.awk b/workflows/cp-leaveout/scripts/count-lines.awk new file mode 100755 index 00000000..cfd5e05f --- /dev/null +++ b/workflows/cp-leaveout/scripts/count-lines.awk @@ -0,0 +1,25 @@ +#!/usr/bin/awk -f + +# COUNT LINES AWK +# Like cat, but counts lines and time + +BEGIN { + t0 = systime() + count = 0 +} + +{ + print $0 + count++ +} + +END { + t1 = systime() + duration = t1 - t0 + if (duration == 0) + rate = "infinity" + else + rate = count/duration + print "count:", count, "in", duration, "seconds. 
rate:", rate \ + > "/dev/stderr" +} diff --git a/workflows/cp-leaveout/scripts/cp-subtree.sh b/workflows/cp-leaveout/scripts/cp-subtree.sh new file mode 100755 index 00000000..364d821d --- /dev/null +++ b/workflows/cp-leaveout/scripts/cp-subtree.sh @@ -0,0 +1,51 @@ +#!/bin/zsh -f +set -eu + +# CP SUBTREE SH +# Make a subset of the existing experiment tree +# Selects N leaf nodes at stage STAGE +# Copies those leaf nodes and their parents into output directory OUT + +THIS=$( realpath $( dirname $0 ) ) + +SUPERVISOR=$( realpath $THIS/../../.. ) +alias shopt=: +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide DIR OUT STAGE N" \ + DIR OUT STAGE N - ${*} + +EXP_ID=${DIR:t} + +mkdir -pv $OUT/$EXP_ID/run +OUT=$( realpath $OUT ) + +cd $DIR/run + +# Make pattern for grep on directory names +P=() +PATTERN="" + +# Don't forget stage 0 == "1." +repeat $(( STAGE + 1 )) P+=( . ) +# Join array P with separator . (dot) +PATTERN="^${(j:.:)P}\$" + +# Pull out N random directories that match pattern +NODES=( $( ls | grep "$PATTERN" | shuf -n $N ) ) + +for NODE in $NODES +do + if [[ -d $OUT/$NODE ]] continue + print "copy: $NODE ..." + cp -r $NODE $OUT/$EXP_ID/run + while true + do + # Parent node: chop off last 2 characters + NODE=${NODE[1,-3]} + if (( ${#NODE} == 1 )) break + if [[ -d $OUT/$NODE ]] break + print "copy: $NODE ..." + cp -r $NODE $OUT/$EXP_ID/run + done +done diff --git a/workflows/cp-leaveout/scripts/data-size.py b/workflows/cp-leaveout/scripts/data-size.py new file mode 100644 index 00000000..e7235096 --- /dev/null +++ b/workflows/cp-leaveout/scripts/data-size.py @@ -0,0 +1,83 @@ +# DATA SIZE PY +# Get the training data size from the file + +import argparse +import logging +import os +import sys + +import pandas as pd +from utils import fail + +parser = argparse.ArgumentParser(description="Extract the data size") +parser.add_argument("input", help="The training file") +args = parser.parse_args() + +print("data-size.py: opening '%s' ..." 
% args.input) + +_, ext = os.path.splitext(args.input) +if ext == ".h5" or ext == ".hdf5": + store = pd.HDFStore(args.input, "r") + # df = store.get("df") + df_y_train = store.get("y_train") + print("train " + str(df_y_train.shape)) + df_y_val = store.get("y_val") + print("val " + str(df_y_val.shape)) + df_x_train_0 = store.get("x_train_0") + print("x0 " + str(df_x_train_0.shape)) + df_x_train_1 = store.get("x_train_1") + print("x1 " + str(df_x_train_1.shape)) + + print(df_x_train_0.index) + + clms = df_x_train_0.columns + print(clms) + for clm in clms: + print(df_x_train_0.at[2, clm]) + # print(df_x_train_1.columns) + + store.close() + +elif ext == ".feather": + print("read feather " + str(args.input)) + df = pd.read_feather(args.input).fillna(0) + print(df.shape) + print(df.dtypes) + print(str(df["CELL"])) + C = {} + for s in df["CELL"]: + C[s] = "" + D = {} + for s in df["DRUG"]: + D[s] = "" + print("df.columns: " + str(df.columns)) + print("df.index: " + str(df.index)) + print("len(df): " + str(len(df))) + print("len(C): " + str(len(C))) + print("len(D): " + str(len(D))) + print("len(AUC): " + str(len(df["AUC"]))) + + # print(str(df["CELL"][0:9])) + # print(str(type(df["CELL"][0]))) + +print("data-size: OK.") + +# total size: (529940, 6215) + +# store = pd.HDFStore(args.input, "r", complevel=9, complib="blosc:snappy") +# print(str(store)) + +# print(store.get("y_val")) + +# f = h5py.File(args.file, "r") + +# # print(f.name) + +# K = list(f.keys()) +# print(K) +# for g in K: +# print(g) +# if type(f[g]) == h5py._hl.group.Group: +# D = f[g].keys() +# print(list(D)) +# print("") diff --git a/workflows/cp-leaveout/scripts/data-size.sh b/workflows/cp-leaveout/scripts/data-size.sh new file mode 100755 index 00000000..8724af93 --- /dev/null +++ b/workflows/cp-leaveout/scripts/data-size.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -eu + +# DATA SIZE SH +# See data-size.py + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize 
$THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 $THIS/data-size.py $* diff --git a/workflows/cp-leaveout/scripts/describe-node.py b/workflows/cp-leaveout/scripts/describe-node.py new file mode 100755 index 00000000..e1a2d2d7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/describe-node.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +# DESCRIBE NODE PY +# + +import argparse +import json + +parser = argparse.ArgumentParser() +parser.add_argument("plan", type=str, help="Plan data file") +parser.add_argument("node", type=str, help='The node e.g. "1.2.3"') +args = parser.parse_args() + +try: + with open(args.plan) as fp: + J = json.load(fp) +except Exception as e: + print("could not read JSON in file: %s\n" % args.plan + str(e)) + exit(1) + +for node in J.keys(): + if len(node) == 13: + # print(node) + # print(len(J[node]["train"])) + # print(J[node]["train"]) + for item in J[node]["train"]: + # print(item) + # print(item["cell"]) + print(len(item["cell"])) + print("") + # exit() + # print(str(J[args.node]["train"])) diff --git a/workflows/cp-leaveout/scripts/distill-holdout-errors.pl b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl new file mode 100644 index 00000000..3a71e02a --- /dev/null +++ b/workflows/cp-leaveout/scripts/distill-holdout-errors.pl @@ -0,0 +1,52 @@ + +# DISTILL HOLDOUT ERRORS PL +# Original holdout error plotting scripts from Brettin +# Slack #cp-leaveout 2020-07-24 +# Uses stdin/stdout +# Input: holdout-errors.txt from extract-holdout-errors +# Output: Plottable TSV file for plot-holdout-errors.py + +$stages = uc(shift @ARGV); +$class = uc(shift @ARGV); +# Select error type for this run +# (index is the column in the data after removing text tokens): +if ($class eq "MSE") {$idx=1} +elsif ($class eq "MAE") {$idx=2} +elsif ($class eq "R2" ) {$idx=3} +else {die "usage: $0 MSE|MAE|R2"} + +while(<>){ + chomp; + # Remove readability tokens: + 
s/mse://; + s/mae://; + s/r2://; + # Split on WS: + @a=split/\s+/; + # h: The big Perl hash of all the data + # Maps node ID to the selected error type value: + $h{$a[0]}=$a[$idx]; +} + +# Suppresses a warning about the ~~ operator below: +use experimental 'smartmatch'; + +# Plot one line for each "leaf" node - a node ID with no children +foreach $id (sort keys %h) { + # Loop if there are any children of this node in the hash + if (/$id\./ ~~ %h) { next; } + + # Construct a line for the output TSV via prepend: + # Gets the parent ids for each id (drops 2 trailing chars) + # until the id is too short + @line = (); + for ( ; length $id > 2 ; $id = substr $id, 0, -2) { + unshift(@line, "$h{$id}\t"); + } + # Fill in any missing stages (pandas can handle a blank value): + while (scalar @line < $stages) { + push(@line, "\t"); + } + push(@line, $class); + print(@line, "\n"); +} diff --git a/workflows/cp-leaveout/scripts/epoch-count.sh b/workflows/cp-leaveout/scripts/epoch-count.sh new file mode 100755 index 00000000..655099ed --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-count.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -eu + +# EPOCH COUNT SH +# Report run progress in number of completed epochs + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +# Must use TMPFILE to avoid subshell for shell variables +mkdir -pv /tmp/$USER +TMPFILE=/tmp/$USER/epoch-count-XXX.tmp + +EARLIES=0 +LOGS=( $( find $DIR -name python.log ) ) +TOTAL=${#LOGS[@]} +echo "epoch-count.sh: found $TOTAL logs ..." 
+for LOG in ${LOGS[@]} +do + echo -n "$LOG :: " + # Pull out the last "Epoch:" line, print only the number: + EPOCH=$( sed -n '/Epoch:/h;${g;s/.*Epoch: \([0-9]*\).*/\1/;p}' $LOG ) + if grep -q "stopping: early" $LOG + then + EARLY="EARLY" + (( EARLIES += 1 )) + else + EARLY="" + fi + echo $EPOCH $EARLY +done > $TMPFILE +cat $TMPFILE | nl | sort -r -n -k 4 | column -t +echo "earlies: $EARLIES / $TOTAL" diff --git a/workflows/cp-leaveout/scripts/epoch-status.sh b/workflows/cp-leaveout/scripts/epoch-status.sh new file mode 100755 index 00000000..838ddb0e --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-status.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -eu + +# EPOCH STATUS SH +# Report epoch progress status for all python.logs + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +LOGS=( $( find $DIR -name python.log ) ) +echo "epoch-count.sh: found ${#LOGS[@]} logs ..." 
+ +COMPLETED=0 +for LOG in ${LOGS[@]} +do + if grep -q "EPOCHS COMPLETED" $LOG + then + (( COMPLETED = COMPLETED+1 )) + else + echo + echo $LOG + tail $LOG + fi +done +echo "COMPLETED: $COMPLETED" diff --git a/workflows/cp-leaveout/scripts/epoch-time.py b/workflows/cp-leaveout/scripts/epoch-time.py new file mode 100644 index 00000000..1021f8b2 --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-time.py @@ -0,0 +1,68 @@ +# EPOCH TIME PY +# See epoch-time.sh + +import datetime +import sys + +# Main data structure: +# map from stage number to list of epoch times in seconds +stages = {} +for stage in range(1, 6 + 1): + stages[stage] = [] + +# Files processed: +progress = 0 +total = 0 + +node_current = "NONE" +stage_current = -1 +start_current = None + +while True: + + line = sys.stdin.readline() + + if len(line) == 0: + break # EOF + if len(line) == 1: + continue # Blank line + tokens = line.split() + + if tokens[0] == "epoch-time:": + if tokens[1] == "node": + node_current = tokens[2] + stage_current = int(len(node_current) / 2) + start_current = None + # print("node: " + node_current) + # print("stage: " + str(stage_current)) + progress += 1 + elif tokens[1] == "total": + total = int(tokens[2]) + else: + assert False + continue + + if tokens[2] == "UNO" and tokens[3] == "START": + # This is a Keras restart: Reset the timer + start_current = None + + if tokens[2] == "Epoch": + ts = tokens[0] + " " + tokens[1] + dt = datetime.datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") + if start_current is None: + start_current = dt + continue + start = start_current.timestamp() + stop = dt.timestamp() + duration = stop - start + # print("epoch complete: " + str(duration)) + start_current = dt + stages[stage_current].append(duration) + +for stage in range(1, 6 + 1): + n = len(stages[stage]) + if n == 0: + avg = -1 + else: + avg = sum(stages[stage]) / n + print("stage %i count: %6i avg: %8.2f" % (stage, n, avg)) diff --git a/workflows/cp-leaveout/scripts/epoch-time.sh 
b/workflows/cp-leaveout/scripts/epoch-time.sh new file mode 100755 index 00000000..73ad058a --- /dev/null +++ b/workflows/cp-leaveout/scripts/epoch-time.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -eu + +# EPOCH TIME SH +# Report average time per epoch by stage + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +CPLO=$( readlink --canonicalize $THIS/.. ) +SUPERVISOR=$( readlink --canonicalize $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +EXPID=$( basename $DIR ) +JOBID=$( cat $DIR/jobid.txt ) +show EXPID JOBID + +NODES=( $( ls $DIR/run | head -10000 ) ) # +echo "epoch-time.sh: found ${#NODES[@]} nodes ..." +{ + echo "epoch-time: total ${#NODES[@]}" + for NODE in ${NODES[@]} + do + echo "epoch-time: node $NODE" + PYTHON_LOG=$DIR/run/$NODE/save/python.log + if [[ -e $PYTHON_LOG ]] + then + cat $PYTHON_LOG + fi + done +} | python $THIS/epoch-time.py diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.awk b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk new file mode 100644 index 00000000..24254de2 --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.awk @@ -0,0 +1,29 @@ + +# EXTRACT HOLDOUT ERRORS AWK +# Finds error data in the python.log and reports a summary of it: + +# Input: +# 2020-07-07 14:38:50 Comparing y_true and y_pred: +# 2020-07-07 14:38:50 mse: 0.0063 +# 2020-07-07 14:38:50 mae: 0.0541 +# 2020-07-07 14:38:50 r2: 0.7352 +# 2020-07-07 14:38:50 corr: 0.8590 + +# Output: +# 1.1 mse: 0.0063 mae: 0.0538 r2: 0.7322 +# 1.1.1 mse: 0.0053 mae: 0.0492 r2: 0.7745 +# 1.1.1.1 mse: 0.0050 mae: 0.0480 r2: 0.7864 +# 1.1.1.1.1 mse: 0.0050 mae: 0.0473 r2: 0.7900 +# 1.1.1.1.1.1 mse: 0.0049 mae: 0.0469 r2: 0.7930 +# 1.1.1.1.1.2 mse: 0.0049 mae: 0.0470 r2: 0.7930 + +$3 == "Comparing" { + getline + mse 
= $3 " " $4 + getline + mae = $3 " " $4 + getline + r2 = $3 " " $4 + printf "%-14s %s %s %s\n", node, mse, mae, r2 + exit +} diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.sh b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh new file mode 100755 index 00000000..c3ae51e8 --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -eu + +# EXTRACT HOLDOUT ERRORS SH +# Extract holdout error data from all python.logs +# in given experiment directory +# Provide an experiment directory DIR +# Creates $DIR/holdout-errors.txt +# See extract-holdout-errors.awk for file formats + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if [[ ! -d $DIR ]] +then + echo "$0: Given experiment directory does not exist: $DIR" + exit 1 +fi + +EXTRACT_HOLDOUT_ERRORS_AWK=$THIS/extract-holdout-errors.awk + +# Missing python.logs (usually due to no data): +MISSING=0 +NODES=( $( ls $DIR/run ) ) +# set -x +echo "NODES: ${#NODES[@]}" +# echo ${NODES[@]} +for NODE in ${NODES[@]} +do + LOG=$DIR/run/$NODE/save/python.log + if [[ -r $LOG ]] + then + awk -f $EXTRACT_HOLDOUT_ERRORS_AWK -v node=$NODE < $LOG + else + MISSING=$(( MISSING + 1 )) + fi +done > $DIR/holdout-errors.txt + +echo "Missing python.logs: $MISSING" diff --git a/workflows/cp-leaveout/scripts/extract-holdout-errors.test b/workflows/cp-leaveout/scripts/extract-holdout-errors.test new file mode 100644 index 00000000..c7c1c637 --- /dev/null +++ b/workflows/cp-leaveout/scripts/extract-holdout-errors.test @@ -0,0 +1,9 @@ +2020-07-07 14:34:19 [Epoch: 48] loss: 0.004852, lr: 0.000012, mae: 0.048262, r2: -0.008181, val_loss: 0.008754, val_mae: 0.064672, val_r2: -0.532295 +2020-07-07 14:34:20 Epoch 49: lr=1.25e-05 +2020-07-07 14:38:27 [Epoch: 49] loss: 0.004851, lr: 
0.000012, mae: 0.048300, r2: -0.012895, val_loss: 0.008730, val_mae: 0.064673, val_r2: -0.535607 +2020-07-07 14:38:50 Comparing y_true and y_pred: +2020-07-07 14:38:50 mse: 0.0063 +2020-07-07 14:38:50 mae: 0.0541 +2020-07-07 14:38:50 r2: 0.7352 +2020-07-07 14:38:50 corr: 0.8590 +2020-07-07 14:40:24 Cache parameter file does not exist: cache/top6_auc.params.json diff --git a/workflows/cp-leaveout/scripts/extract-node-info.py b/workflows/cp-leaveout/scripts/extract-node-info.py index 023c53f4..55c1fc0f 100644 --- a/workflows/cp-leaveout/scripts/extract-node-info.py +++ b/workflows/cp-leaveout/scripts/extract-node-info.py @@ -1,4 +1,3 @@ - # EXTRACT NODE INFO PY # Input: Provide an experiment directory @@ -7,26 +6,30 @@ # Use print-node-info to print the node info # See Node.py for the data structure -import argparse, logging, os, pickle, sys +import argparse +import logging +import os +import pickle -from utils import fail from Node import Node +from utils import fail -parser = argparse.ArgumentParser(description='Parse all log files') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Parse all log files") +parser.add_argument("directory", help="The experiment directory (EXPID)") args = parser.parse_args() log_list = args.directory + "/log-list.txt" node_pkl = args.directory + "/node-info.pkl" -logging.basicConfig(level=logging.WARN, format="%(message)s") +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger("extract_node_info") + def read_log_filenames(log_list): result = [] count = 0 - limit = 2000 # Reduce this for debugging + limit = 5000 # Reduce this for debugging try: with open(log_list) as fp: for line in fp.readlines(): @@ -38,62 +41,152 @@ def read_log_filenames(log_list): line = line.strip() result.append(line) except IOError as e: - abort(e, os.EX_IOERR, "Could not read: " + log_list) + fail(e, os.EX_IOERR, "Could not read: " + log_list) return 
result + def parse_logs(log_files): - # Dict mapping Node id to Node for all complete Nodes + # Dict mapping Node id to Node for all complete Nodes: nodes = {} - logging.warning("Opening %i log files..." % len(log_files)) + logger.warning("Opening %i files..." % len(log_files)) try: + total = len(log_files) + index = 0 for log_file in log_files: - logging.info("Opening: " + log_file) + progress = "%4i/%4i (%2.f%%)" % (index, total, + 100.0 * index / total) + logger.info("Opening: %12s %s" % (progress, log_file)) with open(log_file) as fp: parse_log(fp, nodes) + index += 1 except IOError as e: - abort(e, os.EX_IOERR, "Could not read: " + log_file) + fail(e, os.EX_IOERR, "Could not read: " + log_file) return nodes + def parse_log(log_fp, nodes): nodes_found = 0 node_current = None + build_df = None + while True: line = log_fp.readline() + # print(line) if line == "": break - if "PARAM UPDATE START" in line: - node_current = Node() - node_current.parse_date_start(line) - if "MODEL RUNNER DEBUG node =" in line: - tokens = line.split() - node_id = tokens[-1].strip() - node_current.set_id(node_id) - elif "MODEL RUNNER DEBUG epochs =" in line: - node_current.parse_epochs(line) - elif Node.training_done in line: - node_current.parse_training_done(line) - elif "early stopping" in line: - if node_current != None: - # TensorFlow may report early stopping even if at max epochs: - node_current.stop_early() + if line.startswith("data_setup.pre_run"): + if "node:" in line: + tokens = line.split() + node_id = tokens[-2].strip() + node_current = get_node(nodes, node_id, logger) elif "DONE: run_id" in line: - node_current.parse_date_stop(line) - if node_current != None and node_current.complete: - # Store a complete Node - nodes[node_current.id] = node_current + # This is also a MODEL RUNNER line, + # but could be DEBUG or INFO + # (should be INFO in future) + if node_current is None: + # Restarted node with no epochs remaining: + continue + trace("RUN DONE.") + 
node_current.parse_date_stop(line, logger) + elif "MODEL RUNNER" in line: + # print(line.strip()) + if "PARAM UPDATE START" in line: + node_current.parse_date_start(line) + if " epochs =" in line: + if node_current is None: + # Restarted node with no epochs remaining: + continue + node_current.parse_epochs(line, logger) + elif line.startswith("data_setup: build_dataframe() OK"): + build_df = parse_build_df(line, logger) + elif line.startswith("Loaded from initial_weights"): + node_current.parse_load_initial(line, logger) + elif line.startswith("Epoch ") and "/" in line: + node_current.parse_epoch_status(line, logger) + elif line.startswith("Current "): + node_current.parse_current_time(line, logger) + elif Node.training_done in line and "ETA:" not in line: + node_current.parse_training_done(line, logger) + elif line.startswith("model wrote:"): + node_current.parse_model_write(line, logger) + elif "topN_NoDataException" in line: + node_current.has_data = False + elif "early stopping" in line: + if not "setting early stopping patience" in line: + if node_current is not None: + # TensorFlow may report early stopping even if at max epochs: + node_current.stop_early() + if node_current is not None and node_current.complete: + # Store a complete Node in global dict nodes + trace("node done.") + # find_val_data(node_current) # old format? + parse_python_log(node_current) + # print(Node.str_table(node_current)) nodes_found += 1 node_current = None + # exit(0) + + logger.info("Found %i nodes in log." 
% nodes_found) + + +def get_node(nodes, node_id, logger): + + if "'" in node_id: + node_id = node_id.replace("'", "") + if node_id not in nodes: + trace("NEW: " + node_id) + result = Node(logger=logger) + result.set_id(node_id, logger) + nodes[node_id] = result + else: + trace("lookup: " + node_id) + result = nodes[node_id] + result.new_segment() + result.complete = False + return result + + +def parse_build_df(line, logger=None): + tokens = line.split() + assert len(tokens) == 6 + global build_df + build_df = float(tokens[4]) + # logger.info("build_df: %0.2f" % build_df) + return build_df + + +def trace(message): + logger.log(level=logging.DEBUG - 5, msg=message) + + +# def find_val_data(node): +# python_log = args.directory + "/run/%s/save/python.log" % node.id +# if not os.path.exists(python_log): +# return +# with open(python_log) as fp: +# node.parse_val_data(fp) +# if node.val_data == None: +# logger.fatal("Could not find val data for node: " + node.id) + + +def parse_python_log(node): + python_log = args.directory + "/run/%s/save/python.log" % node.id + if not os.path.exists(python_log): + return + with open(python_log) as fp: + node.parse_python_log(fp) + if node.mse is None: + logger.fatal("Could not find error data for node: " + node.id) - logging.info("Found %i nodes in log." % nodes_found) # List of log file names log_files = read_log_filenames(log_list) # Dict mapping Node id to Node for all complete Nodes nodes = parse_logs(log_files) -logging.warning("Found %i nodes in total." % len(nodes)) +logger.warning("Found %i nodes in total." % len(nodes)) with open(node_pkl, "wb") as fp: pickle.dump(nodes, fp) -logging.warning("Wrote %s ." % node_pkl) +logger.warning("Wrote pickle: %s ." 
% node_pkl) diff --git a/workflows/cp-leaveout/scripts/extract-node-info.sh b/workflows/cp-leaveout/scripts/extract-node-info.sh index d7b75eb0..c4243865 100755 --- a/workflows/cp-leaveout/scripts/extract-node-info.sh +++ b/workflows/cp-leaveout/scripts/extract-node-info.sh @@ -3,7 +3,8 @@ set -eu # EXTRACT NODE INFO SH # Extract all data from all logs in given experiment directory -# Provide an experiment directory +# Provide an experiment directory DIR +# Creates $DIR/node-info.pkl THIS=$( readlink --canonicalize $( dirname $0 ) ) @@ -19,26 +20,26 @@ then exit 1 fi - -# # The stdout from the workflow (read by this script) -# OUTPUT=$DIR/output.txt -# # The output of this script, a plottable file -# SUMMARY=$DIR/summary.txt - # Put all matching file names in this file, one per line # (this could contain thousands of entries, too long for command line): LOG_LIST=$DIR/log-list.txt +shopt -s nullglob # Ignore empty globs RESTARTS=( $DIR/restarts/* ) -shopt -s nullglob # Ignore empty globs +for RESTART in ${RESTARTS[@]} +do + $SUPERVISOR/scripts/shrink-logs.sh $RESTART/out +done +$SUPERVISOR/scripts/shrink-logs.sh $DIR + { for RESTART in ${RESTARTS[@]} do - echo $RESTART/out/out-*.txt + find $RESTART/out -name summary- done - echo $DIR/out/out-*.txt -} | fmt -w 1 > $LOG_LIST + find $DIR/out -name summary- +} > $LOG_LIST # | fmt -w 1 export PYTHONPATH+=:$SUPERVISOR/workflows/common/python diff --git a/workflows/cp-leaveout/scripts/extract-stats.sh b/workflows/cp-leaveout/scripts/extract-stats.sh index 707ff384..11ba980e 100755 --- a/workflows/cp-leaveout/scripts/extract-stats.sh +++ b/workflows/cp-leaveout/scripts/extract-stats.sh @@ -38,7 +38,7 @@ FORMAT="%-6s %-10s %-8s %-8s %-8s %-8s" sed "/Current time/ {s/Current time \.\.\.\.\(.*\)/\1/ ; h}; \$!d; x" \ $DIR/run/$RUN/model.log ) ) - if [ ${#STATS[@]} -gt 0 ] + if (( ${#STATS[@]} > 0 )) then printf " $FORMAT\n" $STAGE $RUN ${STATS[@]} fi diff --git a/workflows/cp-leaveout/scripts/find-loss-increases.py 
b/workflows/cp-leaveout/scripts/find-loss-increases.py index 08e2ef80..c7072500 100644 --- a/workflows/cp-leaveout/scripts/find-loss-increases.py +++ b/workflows/cp-leaveout/scripts/find-loss-increases.py @@ -1,4 +1,3 @@ - # FIND LOSS INCREASES PY # Brettin email 2019-12-18: @@ -6,115 +5,332 @@ # that when added to the training samples, # cause the performance of the node/model to decrease. -import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node from utils import append, avg, fail STAGE_ANY = 0 -parser = argparse.ArgumentParser(description='Finds loss increases.') -parser.add_argument('directory', - help='The experiment directory (EXPID)') -parser.add_argument('--filename', '-f', - default='node-info', - help='Change the node pkl file name') -parser.add_argument('--stage', '-S', +parser = argparse.ArgumentParser(description="Finds loss increases.") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--filename", + "-f", + default="node-info", + help="Change the node pkl file name") +parser.add_argument("--stage", + "-S", type=int, default=STAGE_ANY, - help='Select the stage') -parser.add_argument('--token', '-T', default=None, - help='User-readable naming token') + help="Select the stage") +parser.add_argument("--token", + "-T", + default=None, + help="User-readable naming token") args = parser.parse_args() if args.token == None: args.token = os.path.basename(args.directory) -node_pkl = args.directory + '/' + args.filename + '.pkl' +node_pkl = args.directory + "/" + args.filename + ".pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: # This is a dict ('node_id' -> Node) data = pickle.load(fp) except IOError as e: - fail(e, os.EX_IOERR, 'Could not read: ' + node_pkl) + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +print("total nodes: %i" % len(data)) # Artificial nodes for comparison: -node_worst = Node("WORST") 
-node_worst.val_loss = 0 -node_best = Node("BEST") -node_best.val_loss = 1000 +# !! Updated upstream +node_loss_worst = Node("WORST") +node_loss_worst.loss = 0 +node_loss_best = Node("BEST") +node_loss_best.loss = 1000 + +# List of Nodes where loss increased: +increases_loss = [] +# Total Node count: +total = 0 +# Stage 5 Nodes +leaves = 0 +for node_id in data.keys(): + # print("node: " + node_id) + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 + continue + if parent_id not in data: + print("parent not found.") + continue + current = data[node_id] + parent = data[parent_id] + if current.stage == 5: + leaves += 1 + if not (args.stage == STAGE_ANY or args.stage == current.stage): + continue + current.loss_delta = current.loss - parent.loss + if current.loss_delta > 0: + increases_loss.append(current) + if current.val_loss > node_loss_worst.loss: + node_worst = current + if current.val_loss < node_loss_best.loss: + node_best = current + total += 1 + +fraction = 100.0 * len(increases_loss) / total +print("increases_loss/total = %i / %i (%02.f%%)" % + (len(increases_loss), total, fraction)) + +# Artificial nodes for comparison: +node_vl_worst = Node("WORST") +node_vl_worst.val_loss = 0 +node_vl_best = Node("BEST") +node_vl_best.val_loss = 1000 +# == +# val_loss: +node_worst_val_loss = Node("WORST VAL_LOSS") +node_worst_val_loss.val_loss = 0 +node_best_val_loss = Node("BEST VAL_LOSS") +node_best_val_loss.val_loss = 1000 +# loss: +node_worst_loss = Node("WORST LOSS") +node_worst_loss.loss = 0 +node_best_loss = Node("BEST LOSS") +node_best_loss.loss = 1000 +# !! Stashed changes if args.stage != STAGE_ANY: print("STAGE: %i" % args.stage) +# !! 
Updated upstream +leaves = 0 # stage 5 Nodes + # List of Nodes where val_loss increased: -increases = [] +increases_vl = [] # Total Node count: total = 0 for node_id in data.keys(): - parent_id = node_id[0:-2] # '1.2.3' -> '1.2' - if len(parent_id) == 1: # stage=1 + # print("node: " + node_id) + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 + continue + if parent_id not in data: + print("parent not found.") continue current = data[node_id] - parent = data[parent_id] + parent = data[parent_id] + if current.stage == 5: + leaves += 1 if not (args.stage == STAGE_ANY or args.stage == current.stage): continue current.val_loss_delta = current.val_loss - parent.val_loss if current.val_loss_delta > 0: - increases.append(current) - if current.val_loss > node_worst.val_loss: node_worst = current - if current.val_loss < node_best.val_loss: node_best = current + increases_vl.append(current) + if current.val_loss > node_vl_worst.val_loss: + node_worst = current + if current.val_loss < node_vl_best.val_loss: + node_best = current total += 1 -if total == 0: fail('No matching Nodes found!') -fraction = 100.0 * len(increases) / total -print('increases/total = %i / %i (%02.f%%)' % (len(increases), total, fraction)) +# == +def get_increases(): + # List of Nodes where loss increased: + global increases_loss + increases_loss = [] + # List of Nodes where val_loss increased: + global increases_val_loss + increases_val_loss = [] + + global node_worst_loss, node_worst_val_loss + global node_best_loss, node_best_val_loss + + # count of Nodes: + total = 0 + # count of stage 5 Nodes + leaves = 0 + # count of Nodes with missing parent + parents_missing = 0 + for node_id in data.keys(): + # print("node: " + node_id) + parent_id = node_id[0:-2] # '1.2.3' -> '1.2' + if len(parent_id) == 1: # stage=1 + continue + if parent_id not in data: + # print("parent not found.") + parents_missing += 1 + continue + current = data[node_id] + parent = data[parent_id] + if 
current.stage == 5: + leaves += 1 + if not (args.stage == STAGE_ANY or args.stage == current.stage): + continue + current.val_loss_delta = current.val_loss - parent.val_loss + current.loss_delta = current.loss - parent.loss + # Register increases: + if current.val_loss_delta > 0: + increases_val_loss.append(current) + if current.loss_delta > 0: + increases_loss.append(current) + # Update best/worst: + if current.loss > node_worst_loss.loss: + node_worst_loss = current + if current.loss < node_best_loss.loss: + node_best_loss = current + if current.val_loss > node_worst_val_loss.val_loss: + node_worst_val_loss = current + if current.val_loss < node_best_val_loss.val_loss: + node_best_val_loss = current + total += 1 + print("parents_missing: %i" % parents_missing) + return total, leaves + + +# total: count of Nodes +# leaves: count of stage 5 Nodes +total, leaves = get_increases() +# !! Stashed changes + +print("leaves: %i" % leaves) + +if total == 0: + fail("No matching Nodes found!") -file_increases = "increases-%s.data" % args.token -append(file_increases, "%i %5.1f" % (args.stage, fraction)) +# !! 
Updated upstream +fraction = 100.0 * len(increases_vl) / total +print("increases_vl/total = %i / %i (%02.f%%)" % + (len(increases_vl), total, fraction)) -print('worst val_loss: ' + str(node_worst)) -print('best val_loss: ' + str(node_best)) +file_increases_vl = "increases-vl-%s.data" % args.token +append(file_increases_vl, "%i %5.1f" % (args.stage, fraction)) +# == +fraction = 100.0 * len(increases_loss) / total +print("increases_loss/total = %i / %i (%02.f%%)" % + (len(increases_loss), total, fraction)) +filename = "increases-loss-%s.data" % args.token +append(filename, "%i %5.1f" % (args.stage, fraction)) -print('DELTAS:') +fraction = 100.0 * len(increases_val_loss) / total +print("increases_val_loss/total = %i / %i (%02.f%%)" % + (len(increases_val_loss), total, fraction)) +filename = "increases-val_loss-%s.data" % args.token +append(filename, "%i %5.1f" % (args.stage, fraction)) +# !! Stashed changes + +print("worst loss: " + str(node_worst_loss)) +print("best loss: " + str(node_best_loss)) +print("worst val_loss: " + str(node_worst_val_loss)) +print("best val_loss: " + str(node_best_val_loss)) + +exit() + +print("DELTAS:") + +increases_loss.sort(key=Node.get_loss_delta) +increases_val_loss.sort(key=Node.get_val_loss_delta) +# stopped_early = 0 +# for i in increases: +# # print('%f %-14s %r' % (i.val_loss_delta, i.id, i.stopped_early)) +# if i.stopped_early: stopped_early += 1 -increases.sort(key=Node.get_val_loss_delta) -stopped_early = 0 -for i in increases: - # print('%f %-14s %r' % (i.val_loss_delta, i.id, i.stopped_early)) - if i.stopped_early: stopped_early += 1 def print_delta(prefix, node): - print(prefix, str(node), 'delta: %f' % node.val_loss_delta) - -worst = increases[-1] -print_delta('worst: ', worst) - -n_01p = int(round(len(increases) / 100)) # Worst 1 percentile -if n_01p == 0: n_01p = 1 -worst_01p = increases[-n_01p] -print_delta('worst 1%:', worst_01p) - -n_10p = int(round(len(increases) / 10)) # Worst 10 percentile -if n_10p == 0: n_10p = 1 
-worst_10p = increases[-n_10p] -print_delta('worst 10%:', worst_10p) - -print('increases that stopped early: %i' % stopped_early) - -values_increase = [] -values_val_loss = [] -for node in increases: - values_increase.append(node.get_val_loss_delta()) - values_val_loss.append(node.val_loss) -avg_increase = avg(values_increase) -avg_val_loss = avg(values_val_loss) -print('avg increase: %f' % avg_increase) -delta_ratio = 100.0 * avg_increase / avg_val_loss -print('avg increase fraction: %f' % delta_ratio) - -file_increase_deltas = "increase-deltas-%s.data" % args.token -append(file_increase_deltas, "%i %5.1f" % (args.stage, delta_ratio)) + print(prefix, str(node), "delta: %f" % node.val_loss_delta) + + +# worst = increases[-1] +# print_delta('worst: ', worst) + +# n_01p = int(round(len(increases) / 100)) # Worst 1 percentile +# if n_01p == 0: n_01p = 1 +# worst_01p = increases[-n_01p] +# print_delta('worst 1%:', worst_01p) + +# n_10p = int(round(len(increases) / 10)) # Worst 10 percentile +# if n_10p == 0: n_10p = 1 +# worst_10p = increases[-n_10p] +# print_delta('worst 10%:', worst_10p) + +# print('increases that stopped early: %i' % stopped_early) + +# values_increase = [] +# values_val_loss = [] + +# for node in increases: +# values_increase.append(node.get_val_loss_delta()) +# values_val_loss.append(node.val_loss) + +# avg_increase = avg(values_increase) +# avg_val_loss = avg(values_val_loss) +# print('avg increase: %f' % avg_increase) +# delta_ratio = 100.0 * avg_increase / avg_val_loss +# print('avg increase fraction: %f' % delta_ratio) + +# file_increase_deltas = "increase-deltas-%s.data" % args.token +# append(file_increase_deltas, "%i %5.1f" % (args.stage, delta_ratio)) + +# outliers_file = "outliers-%s.data" % args.token +# print("avg_increase", str(avg_increase)) +# print("avg_val_loss", str(avg_val_loss)) + + +def report_top_loss_deltas(): + print("%-2s %-12s %-8s %-8s %-8s %-8s" % + ("", "node", "loss", "parent", "delta", "ratio")) + 
increases_loss.sort(key=Node.get_loss_delta, reverse=True) + ratios = [] + index = 1 + for node in increases_loss: + parent = data[node.parent()] + ratio = node.get_loss_delta() / parent.loss + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f" % + (index, node.id, node.loss, parent.loss, node.get_loss_delta(), + ratio)) + ratios.append(ratio) + index += 1 + ratios.sort() + + +def report_top_val_loss_deltas(increases_val_loss): + print("%-2s %-12s %-8s %-8s %-8s %-8s %-8s" % + ("", "node", "val_loss", "parent", "delta", "ratio", "val_data")) + increases_val_loss.sort(key=Node.get_val_loss_delta, reverse=True) + ratios = [] + index = 1 + for node in increases_val_loss: + parent = data[node.parent()] + ratio = node.get_val_loss_delta() / parent.loss + print("%2i %-12s %0.6f %0.6f %0.6f %0.6f %8i" % ( + index, + node.id, + node.val_loss, + parent.val_loss, + node.get_val_loss_delta(), + ratio, + node.val_data, + )) + ratios.append(ratio) + index += 1 + ratios.sort() + + +report_top_val_loss_deltas(increases_val_loss) + +# with open(outliers_file, "w") as fp: +# i = 0 +# for ratio in ratios: +# fp.write("%4i %0.7f\n" % (i, ratio)) +# i += 1 + +# with open(outliers_file, "w") as fp: +# i = 0 +# for ratio in ratios: +# fp.write("%4i %0.7f\n" % (i, ratio)) +# i += 1 diff --git a/workflows/cp-leaveout/scripts/leaf-stats.py b/workflows/cp-leaveout/scripts/leaf-stats.py new file mode 100644 index 00000000..2900e510 --- /dev/null +++ b/workflows/cp-leaveout/scripts/leaf-stats.py @@ -0,0 +1,172 @@ +# LEAF STATS PY + +import argparse + +import pandas as pd +import utils + +parser = argparse.ArgumentParser(description="Print leaf stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("list", help="The list of nodes to process") + +args = parser.parse_args() + +# Map from node "1.1.1.1.2.3" to cell line "CCLE.KMS11" +nodes = {} + +with open(args.list, "r") as fp: + while True: + line = fp.readline() + if len(line) == 0: + break + tokens = 
line.split() + node = tokens[0] + cell = tokens[1] + nodes[node] = cell + +columns = [ + "CELL", + "NODE", + "POINTS", + "EPOCHS", + "MAE", + "R2", + "VAL_LOSS", + "EARLY", + "HO_MSE", + "HO_MAE", + "HO_R2", +] + +df = pd.DataFrame(columns=columns) + + +class MatcherPoints(utils.Matcher): + + def __init__(self): + super(MatcherPoints, self).__init__(".*Data points per epoch.*") + self.reset() + + def run(self, line): + tokens = line.split() + # Remove trailing comma: + self.points = tokens[11][0:-1] + + def reset(self): + self.points = 0 + + +class MatcherStats(utils.Matcher): + + def __init__(self): + super(MatcherStats, self).__init__(".*loss:.*") + self.reset() + + def run(self, line): + tokens = line.split() + # Remove trailing bracket or comma: + self.epochs = tokens[3][0:-1] + self.mae = tokens[7][0:-1] + self.r2 = tokens[9][0:-1] + self.val_loss = tokens[11][0:-1] + + def reset(self): + self.epochs = 0 + self.mae = 0 + self.r2 = 0 + self.val_loss = 0 + + +class MatcherEarly(utils.Matcher): + + def __init__(self): + super(MatcherEarly, self).__init__(".*stopping: early.*") + self.reset() + + def run(self, line): + self.early = "1" + + def reset(self): + self.early = "0" + + +class MatcherHoldoutMSE(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutMSE, self).__init__(".* mse:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_mse = tokens[3] + + def reset(self): + self.ho_mse = "0" + + +class MatcherHoldoutMAE(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutMAE, self).__init__(".* mae:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_mae = tokens[3] + + def reset(self): + self.ho_mae = "0" + + +class MatcherHoldoutR2(utils.Matcher): + + def __init__(self): + super(MatcherHoldoutR2, self).__init__(".* r2:.*") + self.reset() + + def run(self, line): + tokens = line.split() + self.ho_r2 = tokens[3] + + def reset(self): + self.ho_r2 = "0" + + +matcherPoints = MatcherPoints() 
+matcherStats = MatcherStats() +matcherEarly = MatcherEarly() +matcherHO_MSE = MatcherHoldoutMSE() +matcherHO_MAE = MatcherHoldoutMAE() +matcherHO_R2 = MatcherHoldoutR2() +grepper = utils.Grepper([ + matcherPoints, + matcherStats, + matcherEarly, + matcherHO_MSE, + matcherHO_MAE, + matcherHO_R2, +]) + +for node in nodes: + cell = nodes[node] + log = f"{args.directory}/run/{node}/save/python.log" + grepper.grep(log) + newrow = pd.DataFrame({ + "CELL": [cell], + "NODE": [node], + "POINTS": [matcherPoints.points], + "EPOCHS": [matcherStats.epochs], + "MAE": [matcherStats.mae], + "R2": [matcherStats.r2], + "VAL_LOSS": [matcherStats.val_loss], + "EARLY": [matcherEarly.early], + "HO_MSE": [matcherHO_MSE.ho_mse], + "HO_MAE": [matcherHO_MAE.ho_mae], + "HO_R2": [matcherHO_R2.ho_r2], + }) + df = pd.concat([df, newrow], ignore_index=True) + grepper.reset() + +from tabulate import tabulate + +print(tabulate(df, headers="keys", tablefmt="plain")) diff --git a/workflows/cp-leaveout/scripts/leaf-stats.sh b/workflows/cp-leaveout/scripts/leaf-stats.sh new file mode 100755 index 00000000..3ef394a9 --- /dev/null +++ b/workflows/cp-leaveout/scripts/leaf-stats.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -eu + +# LEAF STATS +# Report stats for given nodes +# LIST: A file containing a simple per-line list of nodes, +# e.g., "1.1.2\n2.3.1\n4.1.1.1\n" + +THIS=$( realpath $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an run DIR (e.g., .../experiments/X042/run/1.2.3)!" \ + -H "Provide a node list (from list-node-singles)" \ + DIR LIST - ${*} + +if [[ ! 
-d $DIR ]] +then + echo "$0: Given run directory does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +python $THIS/leaf-stats.py $DIR $LIST diff --git a/workflows/cp-leaveout/scripts/list-node-singles.py b/workflows/cp-leaveout/scripts/list-node-singles.py new file mode 100755 index 00000000..268cfef2 --- /dev/null +++ b/workflows/cp-leaveout/scripts/list-node-singles.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +# LIST NODE SINGLES PY +# Extract the nodes from the JSON file with a single cell line +# report the node and cell line + +import argparse +import json + +parser = argparse.ArgumentParser() +parser.add_argument("plan", type=str, help="Plan data file") +args = parser.parse_args() + +try: + with open(args.plan) as fp: + J = json.load(fp) +except Exception as e: + print("could not read JSON in file: %s\n" % args.plan + str(e)) + exit(1) + +count = 0 + +for k in J.keys(): + entry = J[k] + if "val" not in entry: + # Root entry + continue + val = entry["val"] # A list + cells = val[0]["cell"] + if len(cells) == 1: + print(k + " " + cells[0]) + count += 1 + +# print(f"count: {count}") diff --git a/workflows/cp-leaveout/scripts/list-nodes.py b/workflows/cp-leaveout/scripts/list-nodes.py index 378cfcb1..2d3b85eb 100755 --- a/workflows/cp-leaveout/scripts/list-nodes.py +++ b/workflows/cp-leaveout/scripts/list-nodes.py @@ -3,10 +3,11 @@ # LIST NODES PY # Extract just the nodes from the JSON file for human inspection -import argparse, json +import argparse +import json parser = argparse.ArgumentParser() -parser.add_argument('plan', type=str, help='Plan data file') +parser.add_argument("plan", type=str, help="Plan data file") args = parser.parse_args() try: @@ -17,4 +18,4 @@ exit(1) for k in J.keys(): - print(k) + print(k) diff --git a/workflows/cp-leaveout/scripts/loss-histogram.py b/workflows/cp-leaveout/scripts/loss-histogram.py index 4e558eb3..a8968f78 100644 --- a/workflows/cp-leaveout/scripts/loss-histogram.py +++ 
b/workflows/cp-leaveout/scripts/loss-histogram.py @@ -1,4 +1,3 @@ - # LOSS HISTOGRAM # usage: python3 scripts/loss-histogram.py < $D/losses.txt diff --git a/workflows/cp-leaveout/scripts/mk-log-tar.sh b/workflows/cp-leaveout/scripts/mk-log-tar.sh new file mode 100755 index 00000000..46800f44 --- /dev/null +++ b/workflows/cp-leaveout/scripts/mk-log-tar.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# MK LOG TAR SH +# Make a tarball with the important logs but not the big datasets + +THIS=$( realpath $( dirname $0 ) ) + +SUPERVISOR=$( realpath $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +cd $DIR + +echo "find in $PWD ..." + +FILES=( $( find . -name python.log -or -name predicted.tsv ) ) + +echo "found ${#FILES[@]} files." +echo "running tar ..." + +TGZ=logs.tgz # PWD==DIR +time nice -n 19 tar cfz $TGZ ${FILES[@]} + +echo "created:" +ls -lh $TGZ diff --git a/workflows/cp-leaveout/scripts/node-times.py b/workflows/cp-leaveout/scripts/node-times.py new file mode 100644 index 00000000..3727b062 --- /dev/null +++ b/workflows/cp-leaveout/scripts/node-times.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +# NODE TIMES PY +# + +import argparse +import json +import pickle + +import Node + +parser = argparse.ArgumentParser() +parser.add_argument("dir", + type=str, + help="The directory with the node-info.pkl") +args = parser.parse_args() + +node_pkl = args.dir + "/" + "node-info.pkl" + +try: + with open(node_pkl, "rb") as fp: + D = pickle.load(fp) +except Exception as e: + print("could not read PKL file: %s\n" % node_pkl + str(e)) + exit(1) + +# Each a (time, value) record +# value=1 means job start ; value=0 means job stop +events = [] + +import datetime + +for node_id in D.keys(): + node = D[node_id] + fmt = "%Y-%m-%d %H:%M:%S" + start = datetime.datetime.strptime(node.date_start, fmt).timestamp() + stop = datetime.datetime.strptime(node.date_stop, fmt).timestamp() + 
events.append((start, 1)) + events.append((stop, -1)) + +events.sort() + +node_times_data = args.dir + "/node-times.data" +load = 0 + + +def scale(t): + offset = 1594305000 + return (t - offset) / 3600 + + +with open(node_times_data, "w") as fp: + if len(events) > 0: + for event in events: + fp.write("%12.1f %i\n" % (scale(event[0]), load)) + load = load + event[1] + fp.write("%12.1f %i\n" % (scale(event[0]), load)) diff --git a/workflows/cp-leaveout/scripts/node-times.sh b/workflows/cp-leaveout/scripts/node-times.sh new file mode 100755 index 00000000..5d0732f7 --- /dev/null +++ b/workflows/cp-leaveout/scripts/node-times.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu + +# NODE TIMES SH + +THIS=$( readlink --canonicalize $( dirname $0 ) ) +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/node-times.py $* diff --git a/workflows/cp-leaveout/scripts/plot-avgs.sh b/workflows/cp-leaveout/scripts/plot-avgs.sh new file mode 100755 index 00000000..586e2612 --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot-avgs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -eu + +# PLOT AVGS SH + +# Input: Provide an experiment directory DIR +# Output: Plots in PWD for data from times.data & vloss.data + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +# SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ +# DIR - ${*} + +# if [[ ! 
-d $DIR ]] +# then +# echo "$0: Given experiment directory does not exist: $DIR" +# exit 1 +# fi + +DIRS=( ${*} ) +TIMES="" +VLOSS="" +for DIR in ${DIRS[@]} +do + X=$( basename $DIR ) + + D=times-$X.data + cp -uv $DIR/times.data $D + TIMES+="$D " + + # D=vloss-$X.data + # cp $DIR/vloss.data $D + # VLOSS+="$D " +done + +jwplot stage-times.eps $THIS/stage-times.cfg $TIMES +# jwplot stage-vloss.eps $THIS/stage-vloss.cfg $VLOSS diff --git a/workflows/cp-leaveout/scripts/plot-holdout-errors.py b/workflows/cp-leaveout/scripts/plot-holdout-errors.py new file mode 100644 index 00000000..e70d33b0 --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot-holdout-errors.py @@ -0,0 +1,37 @@ +# PLOT HOLDOUT ERRORS PY +# Plots holdout error data from distill-holdout-errors.pl + +import argparse + +import matplotlib.pyplot as plt +import pandas + +# This was removed from Pandas 1.6: +# Cf. https://stackoverflow.com/questions/54473018/where-is-pandas-tools +# from pandas.tools.plotting import parallel_coordinates +from pandas.plotting import parallel_coordinates + +parser = argparse.ArgumentParser(description="Make holdout errors plot") +parser.add_argument("stages", type=int, help="Number of stages") +parser.add_argument("file_input", help="The input errors TSV file") +parser.add_argument("file_output", help="The output PNG file") + +args = parser.parse_args() + +# names = [ 'Stage1','Stage2','Stage3','Stage4', 'Stage5', 'CLASS'] + +names = [] +for i in range(1, args.stages + 1): + names.append("Stage" + str(i)) +names.append("CLASS") + +print(str(names)) + +cpdata = pandas.read_csv(args.file_input, sep="\t", header=None, names=names) +p = parallel_coordinates(cpdata, + class_column="CLASS", + colormap=plt.get_cmap("Set2")) + +# fig = p.gcf() +fig = p.get_figure() +fig.savefig(args.file_output) diff --git a/workflows/cp-leaveout/scripts/plot-io-times.sh b/workflows/cp-leaveout/scripts/plot-io-times.sh new file mode 100755 index 00000000..8d1dbb7d --- /dev/null +++ 
b/workflows/cp-leaveout/scripts/plot-io-times.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -eu + +# PLOT IO TIMES SH + +# Input: Provide an experiment directory DIR +# Output: Plots in PWD for data + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) +source $SUPERVISOR/workflows/common/sh/utils.sh + +# SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ +# DIR - ${*} + +# if [[ ! -d $DIR ]] +# then +# echo "$0: Given experiment directory does not exist: $DIR" +# exit 1 +# fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +DIRS=( ${*} ) +BUILDS="" +LOADS="" +WRITES="" +for DIR in ${DIRS[@]} +do + # python $THIS/plot_io_times.py $DIR + + X=$( basename $DIR ) + D=builds-$X.data + cp -uv $DIR/builds.data $D + BUILDS+="$D " + + X=$( basename $DIR ) + D=loads-$X.data + cp -uv $DIR/loads.data $D + LOADS+="$D " + + X=$( basename $DIR ) + D=writes-$X.data + cp -uv $DIR/writes.data $D + WRITES+="$D " + +done + +set -x +jwplot builds.eps $THIS/stage-builds.cfg $BUILDS +jwplot writes.eps $THIS/stage-writes.cfg $WRITES +jwplot loads.eps $THIS/stage-loads.cfg $LOADS diff --git a/workflows/cp-leaveout/scripts/plot_io_times.py b/workflows/cp-leaveout/scripts/plot_io_times.py new file mode 100644 index 00000000..478306fa --- /dev/null +++ b/workflows/cp-leaveout/scripts/plot_io_times.py @@ -0,0 +1,54 @@ +# PLOT IO TIMES PY + +import argparse +import os +import pickle +import statistics + +from utils import fail + +parser = argparse.ArgumentParser(description="Plot I/O stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, "rb") as fp: + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +builds = {1: [], 2: [], 3: [], 4: [], 5: []} +loads = {1: [], 2: [], 3: [], 4: [], 5: []} +writes = {1: [], 2: [], 3: [], 4: [], 
5: []} + +# Print the node info! +for node in data.values(): + if node.stage == 6: + continue + if node.build_df is not None: + builds[node.stage].append(node.build_df) + if node.load_initial is not None: + loads[node.stage].append(node.load_initial) + if node.ckpt_writes is not None: + writes[node.stage] += list(node.ckpt_writes.values()) + +with open(args.directory + "/builds.data", "w") as fp: + for stage in builds.keys(): + fp.write("%i " % stage) + fp.write("%0.3f" % statistics.mean(builds[stage])) + fp.write(" # count = %i\n" % len(builds[stage])) + +with open(args.directory + "/loads.data", "w") as fp: + for stage in loads.keys(): + if stage == 1: + continue # stage 1 does not do a load + fp.write("%i " % stage) + fp.write("%0.3f\n" % statistics.mean(loads[stage])) + +with open(args.directory + "/writes.data", "w") as fp: + for stage in writes.keys(): + fp.write("%i " % stage) + fp.write("%0.3f\n" % statistics.mean(writes[stage])) diff --git a/workflows/cp-leaveout/scripts/print-node-info.py b/workflows/cp-leaveout/scripts/print-node-info.py index 63092d9a..54f3c2ac 100644 --- a/workflows/cp-leaveout/scripts/print-node-info.py +++ b/workflows/cp-leaveout/scripts/print-node-info.py @@ -1,27 +1,67 @@ - # PRINT NODE INFO PY -import argparse, os, pickle, sys +import argparse +import os +import pickle +import sys from Node import Node from utils import fail -parser = argparse.ArgumentParser(description='Print Node info stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') +parser = argparse.ArgumentParser(description="Print Node info stats") +parser.add_argument("--count", "-c", action="store_true", + help="Simply count the nodes") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("nodes", + default="", + nargs="*", + help="Nodes to print (optional, defaults to all)") args = parser.parse_args() node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with 
open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) +# Raw data printing: +# print(str(args)) +# print(len(data)) # print(data) -for item in data.values(): - print(item.str_table()) -# print(len(data)) + +def print_all(data): + # Print the node info! + print("print_all") + count = 0 + earlies = 0 + for node in data.values(): + # print(node.id) + print(node.str_table()) + count += 1 + if node.stopped_early: + earlies += 1 + print("print-node-info: %i/%i runs stopped early." % (earlies, count)) + + +def print_selected(data, nodes): + for node_id in nodes: + try: + node = data[node_id] + except KeyError: + print("Could not find node: '%s'" % node_id) + exit(1) + print(node.str_table()) + + +if args.count: + print(len(data)) + exit(0) + +if args.nodes == "": + print_all(data) +else: + print_selected(data, args.nodes) diff --git a/workflows/cp-leaveout/scripts/print-node-info.sh b/workflows/cp-leaveout/scripts/print-node-info.sh index a9f89bf2..45c38fec 100755 --- a/workflows/cp-leaveout/scripts/print-node-info.sh +++ b/workflows/cp-leaveout/scripts/print-node-info.sh @@ -5,16 +5,12 @@ set -eu # Input: Provide an experiment directory # Output: Node information printed to screen (pipe this into less) +# See Node.str_table() for the output format THIS=$( readlink --canonicalize $( dirname $0 ) ) - SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) -source $SUPERVISOR/workflows/common/sh/utils.sh - -SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" 
\ - DIR - ${*} export PYTHONPATH+=:$SUPERVISOR/workflows/common/python set -x -python3 -u $THIS/print-node-info.py $DIR +python3 -u $THIS/print-node-info.py ${*} diff --git a/workflows/cp-leaveout/scripts/report-leaves.sh b/workflows/cp-leaveout/scripts/report-leaves.sh new file mode 100755 index 00000000..3bff44cf --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-leaves.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT LEAVES SH +# Report nodes with no children + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_leaves.py $DIR diff --git a/workflows/cp-leaveout/scripts/report-lrs.sh b/workflows/cp-leaveout/scripts/report-lrs.sh new file mode 100755 index 00000000..964b25e3 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-lrs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT LRS SH +# Report learning rates by Node + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! 
[[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_lrs.py $DIR diff --git a/workflows/cp-leaveout/scripts/report-stopping.sh b/workflows/cp-leaveout/scripts/report-stopping.sh new file mode 100755 index 00000000..ce1b46b5 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report-stopping.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -eu + +# REPORT STOPPING SH +# Report early stopping by epoch + +THIS=$( realpath $( dirname $0 ) ) +CPLO=$( realpath $THIS/.. ) +SUPERVISOR=$( realpath $CPLO/../.. ) + +source $SUPERVISOR/workflows/common/sh/utils.sh + +SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ + DIR - ${*} + +if ! [[ -d $DIR ]] +then + echo "Does not exist: $DIR" + exit 1 +fi + +export PYTHONPATH+=:$SUPERVISOR/workflows/common/python + +set -x +python3 -u $THIS/report_stopping.py $DIR diff --git a/workflows/cp-leaveout/scripts/report_leaves.py b/workflows/cp-leaveout/scripts/report_leaves.py new file mode 100644 index 00000000..6ca2736a --- /dev/null +++ b/workflows/cp-leaveout/scripts/report_leaves.py @@ -0,0 +1,37 @@ +# REPORT LEAVES PY + +import argparse +import os +import pickle +import sys + +from Node import Node +from utils import fail + +parser = argparse.ArgumentParser(description="Report nodes with no children.") +parser.add_argument("directory", help="The experiment directory (EXPID)") + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, "rb") as fp: + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +nodes = data.keys() +leaves = data.copy() + +for node in nodes: + parent = node[0:-2] + if parent in leaves: + print("drop: " + parent) + del leaves[parent] + +results = list(leaves.keys()) +results.sort() + +for leaf in results: + print(leaf) diff --git a/workflows/cp-leaveout/scripts/report_stopping.py 
b/workflows/cp-leaveout/scripts/report_stopping.py new file mode 100644 index 00000000..20d1f5e2 --- /dev/null +++ b/workflows/cp-leaveout/scripts/report_stopping.py @@ -0,0 +1,40 @@ +# REPORT STOPPING PY + +import argparse +import os +import pickle +import sys + +from Node import Node +from utils import avg, fail + +parser = argparse.ArgumentParser(description="Report nodes with no children.") +parser.add_argument("directory", help="The experiment directory (EXPID)") + +args = parser.parse_args() + +node_pkl = args.directory + "/node-info.pkl" + +try: + with open(node_pkl, "rb") as fp: + data = pickle.load(fp) +except IOError as e: + fail(e, os.EX_IOERR, "Could not read: " + node_pkl) + +stages = {1: [], 2: [], 3: [], 4: [], 5: [], 6: []} + +for key in data: + # print(key) + node = data[key] + # print(data[node]) + print("%-14s %i %i" % (key, node.stage, node.epochs_actual)) + stages[node.stage].append(node.epochs_actual) + # exit() + +for i in range(1, 7): + L = stages[i] + a = avg(L) + print("%i: %0.3f" % (i, a)) + + # a = st + # 1.3.2.4.2.4.1 diff --git a/workflows/cp-leaveout/scripts/stage-avg.py b/workflows/cp-leaveout/scripts/stage-avg.py index ea938dbe..efdf426b 100755 --- a/workflows/cp-leaveout/scripts/stage-avg.py +++ b/workflows/cp-leaveout/scripts/stage-avg.py @@ -28,21 +28,23 @@ stage, run = tokens[0:2] # print(stage, run) offset = 2 - for index in range(0,len(labels)): + for index in range(0, len(labels)): label = labels[index] if stage not in data[label]: data[label][stage] = [] - data[label][stage].append(tokens[offset+index]) + data[label][stage].append(tokens[offset + index]) # Debug dump of all data: # print(data) + def avg(L): s = 0.0 for v in L: s += float(v) return s / len(L) + def mean_confidence_interval(data, confidence=0.95): import numpy as np import scipy.stats @@ -51,10 +53,11 @@ def mean_confidence_interval(data, confidence=0.95): a = 1.0 * np.array(data) n = len(a) m, se = np.mean(a), scipy.stats.sem(a) - h = se * 
scipy.stats.t.ppf((1 + confidence) / 2., n-1) - c = 100.0 * h / m # Interval scaled to mean + h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1) + c = 100.0 * h / m # Interval scaled to mean return m, h, c + # Average each data[label][stage] and report print("# %-5s %-6s AVG" % ("STAT", "STAGE")) for label in labels: diff --git a/workflows/cp-leaveout/scripts/stage-builds.cfg b/workflows/cp-leaveout/scripts/stage-builds.cfg new file mode 100644 index 00000000..344a3c50 --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-builds.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = dataframe build time (seconds) + +width = 400 +height = 400 + +label.builds-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.builds-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-loads.cfg b/workflows/cp-leaveout/scripts/stage-loads.cfg new file mode 100644 index 00000000..80013aec --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-loads.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = load weights time (seconds) + +width = 400 +height = 400 + +label.loads-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.loads-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-times.cfg b/workflows/cp-leaveout/scripts/stage-times.cfg new file mode 100644 index 00000000..162fc79c --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-times.cfg @@ -0,0 +1,14 @@ +xlabel = stage +ylabel = time (seconds) + +width = 600 +height = 400 + +label.times-X743.data = Summit E=10 +label.times-X744.data = Summit E=50 +label.times-X750.data = Spock E=10 +label.times-X746.data = Spock E=50 + +axis.type.x = integer + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-vloss.cfg b/workflows/cp-leaveout/scripts/stage-vloss.cfg new file mode 100644 index 00000000..dadeddb3 
--- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-vloss.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = validation loss + +width = 800 +height = 600 + +label.vloss-X743.data = Summit E=10 +label.vloss-X744.data = Summit E=50 +label.vloss-X750.data = Spock E=10 +label.vloss-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/stage-writes.cfg b/workflows/cp-leaveout/scripts/stage-writes.cfg new file mode 100644 index 00000000..90853d4f --- /dev/null +++ b/workflows/cp-leaveout/scripts/stage-writes.cfg @@ -0,0 +1,13 @@ +xlabel = stage +ylabel = checkpoint write rate (MB/s) + +width = 400 +height = 400 + +label.writes-X743.data = Summit E=10 +# label.builds-X744.data = Summit E=50 +label.writes-X750.data = Spock E=10 +# label.builds-X746.data = Spock E=50 + + +# legend.enabled = false diff --git a/workflows/cp-leaveout/scripts/tar-experiment.sh b/workflows/cp-leaveout/scripts/tar-experiment.sh index 566c9279..161be878 100755 --- a/workflows/cp-leaveout/scripts/tar-experiment.sh +++ b/workflows/cp-leaveout/scripts/tar-experiment.sh @@ -10,11 +10,31 @@ SUPERVISOR=$( readlink --canonicalize $THIS/../../.. ) source $SUPERVISOR/workflows/common/sh/utils.sh SIGNATURE -H "Provide an experiment DIR (e.g., .../experiments/X042)!" \ - DIR - ${*} + -H "Provide a MODE (STATS or INFER)!" \ + DIR MODE - ${*} # Get directory named "experiments" EXPERIMENTS=$( readlink --canonicalize $( dirname $DIR ) ) EXPID=$( basename $DIR ) +if [[ $MODE == "STATS" ]] +then + # For Node.py stats processing + OPTIONS=( --exclude '*.tsv' --exclude '*.h5' ) + Z="z" + EXT="tgz" +elif [[ $MODE == "INFER" ]] +then + # For inferencing runs + echo "find ..." 
+ MATCHES=( -name '*.json' -or -name 'uno*.log' -or -name 'uno*.h5' ) + find $DIR ${MATCHES[@]} > tar.list + OPTIONS=( --files-from=tar.list ) + DIR="" # Unset this- only files in tar.list are included + Z="" + EXT="tar" +fi + set -x -nice tar cfz $EXPERIMENTS/$EXPID.tgz --exclude '*.h5' --exclude '*.tsv' $DIR +nice tar cf$Z $EXPERIMENTS/$EXPID.$EXT ${OPTIONS[@]} $DIR +du -h $EXPERIMENTS/$EXPID.$EXT diff --git a/workflows/cp-leaveout/scripts/time-nvm.data b/workflows/cp-leaveout/scripts/time-nvm.data index fbbf8cd3..56c3d494 100644 --- a/workflows/cp-leaveout/scripts/time-nvm.data +++ b/workflows/cp-leaveout/scripts/time-nvm.data @@ -5,4 +5,3 @@ restart/0 2 16.35 restart/1 16 14.83 restart/2 126 13.75 restart/3 254 11.69 - diff --git a/workflows/cp-leaveout/scripts/touch-all.sh b/workflows/cp-leaveout/scripts/touch-all.sh new file mode 100755 index 00000000..2b60e592 --- /dev/null +++ b/workflows/cp-leaveout/scripts/touch-all.sh @@ -0,0 +1,16 @@ +#!/bin/sh +set -eu + +# TOUCH ALL SH +# Touch all files in given experiment directories +# to prevent auto-deletion +# Finds dot files too + +THIS=$( readlink --canonicalize $( dirname $0 ) ) + +{ + for DIR in $* + do + nice find $DIR + done +} | $THIS/count-lines.awk | xargs -n 16 touch diff --git a/workflows/cp-leaveout/scripts/touch-exps.zsh b/workflows/cp-leaveout/scripts/touch-exps.zsh new file mode 100755 index 00000000..497c727c --- /dev/null +++ b/workflows/cp-leaveout/scripts/touch-exps.zsh @@ -0,0 +1,43 @@ +#!/bin/zsh +set -eu + +which python + +A=( # 750 + # 746 + # 757 + # 771 + # 743 + # 744 + # 759 + # 763 + # 828 + 838 + 839 + ) + +{ + sw0 + print "START: " $( date-nice ) + print + + for X in $A + do + print experiments/X$X + ds experiments/X$X + last-access experiments/X$X + touch-all experiments/X$X + print + done + + last-access ~/S/proj + touch-all ~/S/proj + print + + last-access /gpfs/alpine/med106/scratch/wozniak/CANDLE-Data + touch-all /gpfs/alpine/med106/scratch/wozniak/CANDLE-Data + + print + print 
"STOP: " $( date-nice ) + sw1 +} |& teeb touch-exps.out diff --git a/workflows/cp-leaveout/scripts/workflow-stats.py b/workflows/cp-leaveout/scripts/workflow-stats.py index 731d6526..40b07492 100644 --- a/workflows/cp-leaveout/scripts/workflow-stats.py +++ b/workflows/cp-leaveout/scripts/workflow-stats.py @@ -1,18 +1,20 @@ - # WORKFLOW STATS PY -import argparse, math, os, pickle, sys +import argparse +import math +import os +import pickle +import sys from Node import Node from utils import fail -parser = argparse.ArgumentParser(description='Print workflow total stats') -parser.add_argument('directory', - help='The experiment directory (EXPID)') -parser.add_argument('--percentiles', action='store_true', - help='If given, run percentiles analysis') -parser.add_argument('--token', default=None, - help='User-readable naming token') +parser = argparse.ArgumentParser(description="Print workflow total stats") +parser.add_argument("directory", help="The experiment directory (EXPID)") +parser.add_argument("--percentiles", + action="store_true", + help="If given, run percentiles analysis") +parser.add_argument("--token", default=None, help="User-readable naming token") args = parser.parse_args() @@ -25,18 +27,18 @@ node_pkl = args.directory + "/node-info.pkl" try: - with open(node_pkl, 'rb') as fp: + with open(node_pkl, "rb") as fp: data = pickle.load(fp) except IOError as e: fail(e, os.EX_IOERR, "Could not read: " + node_pkl) # print(data) + class Statter: - ''' - Compute states for some quantity (epochs_actual, stops, val_loss) - by stage - ''' + """Compute states for some quantity (epochs_actual, stops, val_loss) by + stage.""" + def __init__(self, name=None, token=None): self.data = {} self.name = name @@ -61,7 +63,8 @@ def percentile(self, stage, percentile): self.data[stage].sort(reverse=True) n = len(self.data[stage]) i = round(percentile * n) - 1 - if i < 0: i = 0 + if i < 0: + i = 0 return self.data[stage][i] def report_avg(self): @@ -81,41 +84,43 @@ def 
string_avg(self): return result def string_avg_pct(self): - ''' Average as percentage, i.e., x100 ''' + """Average as percentage, i.e., x100.""" keys = list(self.data.keys()) keys.sort() result = "# %s: avg %%\n" % self.name for key in keys: - result += "%i %6.2f\n" % (key, 100*self.avg(key)) + result += "%i %6.2f\n" % (key, 100 * self.avg(key)) return result def string_percentile(self, percentile): keys = list(self.data.keys()) keys.sort() - result = "# %s: %s: percentile %0.2f\n" % \ - (self.token, self.name, percentile) + result = "# %s: %s: percentile %0.2f\n" % (self.token, self.name, + percentile) for key in keys: result += "%i %0.4f\n" % (key, self.percentile(key, percentile)) return result + epochs = Statter("epochs by stage", token=args.token) -stops = Statter("stops by stage", token=args.token) +stops = Statter("stops by stage", token=args.token) losses = Statter("val_loss by stage", token=args.token) -times = Statter("times by stage", token=args.token) -count = 0 # Total Nodes -steps = 0 # Training steps -tm_s = 0.0 # Total training time +times = Statter("times by stage", token=args.token) +count = 0 # Total Nodes +steps = 0 # Training steps +tm_s = 0.0 # Total training time best_val_loss = Node(id="BEST") best_val_loss.val_loss = 1000 for node in data.values(): count += 1 steps += node.steps - tm_s += node.time + tm_s += node.time epochs.add(node.stage, node.epochs_actual) - stops .add(node.stage, node.stopped_early) + stops.add(node.stage, node.stopped_early) losses.add(node.stage, node.val_loss) - times.add(node.stage, node.total_time(data)) - if node.val_loss < best_val_loss.val_loss: best_val_loss = node + times.add(node.stage, node.total_time(data)) + if node.stage == 5 and node.val_loss < best_val_loss.val_loss: + best_val_loss = node tm_m = tm_s / 60 tm_h = tm_m / 60 @@ -133,18 +138,18 @@ def string_percentile(self, percentile): epochs.report_avg() + def do_percentiles(): for percentile in [0.99, 0.75, 0.50, 0.25, 0.10]: report = 
losses.string_percentile(percentile) - filename = 'percentile-%s-%0.2f.data' % \ - (args.token, percentile) - with open(filename, 'w') as fp: + filename = "percentile-%s-%0.2f.data" % (args.token, percentile) + with open(filename, "w") as fp: fp.write(report) + if args.percentiles: do_percentiles() print("best_val_loss: %s %0.2f hours , %i steps" % - (str(best_val_loss), - best_val_loss.total_time(data)/3600, + (str(best_val_loss), best_val_loss.total_time(data) / 3600, best_val_loss.steps)) diff --git a/workflows/cp-leaveout/swift/baseline-error.sh b/workflows/cp-leaveout/swift/baseline-error.sh new file mode 100755 index 00000000..52287243 --- /dev/null +++ b/workflows/cp-leaveout/swift/baseline-error.sh @@ -0,0 +1,200 @@ +#! /usr/bin/env bash +set -eu + +# BASELINE ERROR SH +# Main entry point for baseline-error workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/Uno +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR + +SCRIPT_NAME=$(basename $0) + +export FRAMEWORK="keras" + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "baseline-error.sh:" \ + "usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME " +} + +if (( ${#} < 5 )) +then + usage + exit 1 +fi + +set -x +if ! 
{ + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +shift 5 +WORKFLOW_ARGS=$* + +echo "WORKFLOW.SH: Running model: $MODEL_NAME for EXPID: $EXPID" + +set +x + +source_site env $SITE +source_site sched $SITE + +PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools, model_runner +APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools +APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [[ ${GPU_STRING:-} == "" ]] +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +CMD_LINE_ARGS=( --benchmark_timeout=$BENCHMARK_TIMEOUT + --site=$SITE + $GPU_ARG + $WORKFLOW_ARGS + ) + +if [[ $WORKFLOW_ARGS = "-r"* ]] +then + echo "Restart requested ..." + if [[ ! -d $TURBINE_OUTPUT ]] + then + echo "No prior run found! (tried $TURBINE_OUTPUT/output.txt)" + exit 1 + fi + if [[ ! -f $TURBINE_OUTPUT/output.txt ]] + then + # If output.txt does not exist, assume the moves already happened + echo "WARNING: The outputs were already moved from $EXPID" + else + next $TURBINE_OUTPUT/restarts/%i # cf. utils.sh:next() + PRIOR_RUN=$REPLY + echo "Moving old outputs to $PRIOR_RUN" + mkdir -pv $PRIOR_RUN + PRIORS=( $TURBINE_OUTPUT/output.txt + $TURBINE_OUTPUT/out + $TURBINE_OUTPUT/turbine* + $TURBINE_OUTPUT/jobid.txt ) + mv ${PRIORS[@]} $PRIOR_RUN + fi +else # Not a restart + if [[ -f $TURBINE_OUTPUT/output.txt ]] + then + echo "TURBINE_OUTPUT already exists- you must specify restart!" 
+ echo "TURBINE_OUTPUT=$TURBINE_OUTPUT" + exit 1 + fi +fi + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -p $TURBINE_OUTPUT/run + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-baseline-error.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +# which python swift-t java + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + STDOUT="" +fi + +TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +mkdir -pv $TURBINE_OUTPUT/out + +swift-t -O 0 -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -I $EMEWS_PROJECT_ROOT/swift \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + -e TURBINE_DB_WORKERS=1 \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} | \ + tee $STDOUT + +# -j /usr/bin/java # Give this to Swift/T if needed for Java +# -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" + exit 1 +fi + +echo "WORKFLOW OK." 
+echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/cp-leaveout/swift/baseline-error.swift b/workflows/cp-leaveout/swift/baseline-error.swift new file mode 100644 index 00000000..4f3c4a12 --- /dev/null +++ b/workflows/cp-leaveout/swift/baseline-error.swift @@ -0,0 +1,89 @@ + +/** + BASELINE ERROR SWIFT + Runs the given nodes in new output directory based on + the pre-processed data in another "reference" directory +*/ + +import assert; +import files; +import io; +import python; +import string; +import sys; + +import candle_utils; +report_env(); + +// == Command-line Arguments Begin == +// The big feather file or CSV +string dataframe_csv = argv("dataframe_csv"); +// Actual CP workflow output directory to use for data sources: +string reference = argv("reference"); +// List of node IDs, one per line +file file_nodes = input(argv("nodes")); +// Mapping from node ID to epochs, one per line +// file file_epochs = input(argv("epochs")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); +int epochs_all = string2int(argv("E", "50")); +int patience = string2int(argv("P", "50")); +// == Command-line Arguments End == + +// == Environment Settings Begin == +string model_name = getenv("MODEL_NAME"); +string exp_id = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +// == Environment Settings End == + +// For compatibility with obj(): +global const string FRAMEWORK = "keras"; + +// Read file of node IDs: +string nodes_lines[] = file_lines(file_nodes); + +// Read file of epochs: +// string epochs_lines[] = file_lines(file_epochs); + +// // Mapping from node ID to epochs: +// string map_epochs[string]; +// foreach line in epochs_lines +// { +// tokens = split(line); +// map_epochs[tokens[0]] = tokens[1]; +// } + +// Resultant output values: +string results[]; + +// Templated parameters for all runs as JSON. +// Some keys must be filled in later. 
+string params_template = +---- +{ +"config_file": "uno_auc_model.txt", +"cache": "cache/top6_auc", +"dataframe_from": "%s", +"save_weights": "save/model.h5", +"gpus": "0", +"epochs": %i, +"es": "True", +"patience": %i, +"node": "%s", +"use_exported_data": "%s" +} +----; + +// Evaluate each parameter set +foreach node, i in nodes_lines +{ + printf("node: %s", node); + // Fill in missing hyperparameters: + string training_data = "%s/run/%s/topN.uno.h5" % (reference, node); + // int epochs = string2int(map_epochs[node]); + int epochs = epochs_all; + string params = params_template % (dataframe_csv, epochs, patience, + node, training_data); + // NOTE: obj() is in the obj_*.swift supplied by workflow.sh + results[i] = obj(params, node); + assert(results[i] != "EXCEPTION", "exception in obj()!"); +} diff --git a/workflows/cp-leaveout/swift/compute_epochs_none.swift b/workflows/cp-leaveout/swift/compute_epochs_none.swift new file mode 100644 index 00000000..907b500d --- /dev/null +++ b/workflows/cp-leaveout/swift/compute_epochs_none.swift @@ -0,0 +1,7 @@ + +/** + COMPUTE EPOCH NONE SWIFT + + This is a dummy module for workflows that do not actually + compute epochs. It has no code. 
+*/ diff --git a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh index d6c60622..ac475a29 100755 --- a/workflows/cp-leaveout/swift/cpl-upf-workflow.sh +++ b/workflows/cp-leaveout/swift/cpl-upf-workflow.sh @@ -65,7 +65,7 @@ CPL_PY=$EMEWS_PROJECT_ROOT/../cp-leaveout/py PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py: # For plangen, data_setup PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools PYTHONPATH+=:$CPL_PY -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools @@ -126,7 +126,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp-leaveout/swift/cpl-upf-workflow.swift b/workflows/cp-leaveout/swift/cpl-upf-workflow.swift index 4bad5ef7..79a9e2d6 100644 --- a/workflows/cp-leaveout/swift/cpl-upf-workflow.swift +++ b/workflows/cp-leaveout/swift/cpl-upf-workflow.swift @@ -84,7 +84,7 @@ global const string FRAMEWORK = "keras"; r = obj(json2, node) => string hist_json = read_history(node); db_stop_result = plangen_stop(db_file, node, plan_id, hist_json) => - assert(db_stop_result != "EXCEPTION", "Exception in plangen_stop()!") => + assert(db_stop_result != "EXCEPTION", "Exception in plangen_stop()!") => printf("stop_subplan result: %s", db_stop_result); } else { printf("plan node already marked complete: %s result=%s", node, db_start_result) => @@ -194,6 +194,6 @@ main() { // string result = join(results, ";") => file out<"%s/plan_id.txt" % turbine_output> = write("%s\n" % plan_id); - write_lines(results, "results.txt") => + 
write_lines(results, "results.txt") => printf("CP LEAVEOUT WORKFLOW: RESULTS: COMPLETE"); } diff --git a/workflows/cp-leaveout/swift/plangen_0.swift b/workflows/cp-leaveout/swift/plangen_0.swift new file mode 100644 index 00000000..3d0dc530 --- /dev/null +++ b/workflows/cp-leaveout/swift/plangen_0.swift @@ -0,0 +1,25 @@ + +/* + PLANGEN 0 SWIFT + Disables plangen. Used by ResNet 50 problem +*/ + +(string result) plangen_check() +{ + result = "OK"; +} + +(string result) plangen_prep(string db_file, string plan_json, string runtype) +{ + result = "42"; +} + +(string result) plangen_start(string node, string plan_id) +{ + result = "0"; +} + +(string result) plangen_stop(string node, string plan_id) +{ + result = "OK"; +} diff --git a/workflows/cp-leaveout/swift/plangen.swift b/workflows/cp-leaveout/swift/plangen_1.swift similarity index 96% rename from workflows/cp-leaveout/swift/plangen.swift rename to workflows/cp-leaveout/swift/plangen_1.swift index cdd06452..fe1f8913 100644 --- a/workflows/cp-leaveout/swift/plangen.swift +++ b/workflows/cp-leaveout/swift/plangen_1.swift @@ -1,3 +1,9 @@ + +/* + PLANGEN 1 SWIFT + An early attempt at plangen with FS locks - did not work. +*/ + import python; pragma worktypedef DB; diff --git a/workflows/cp-leaveout/swift/plangen_2.swift b/workflows/cp-leaveout/swift/plangen_2.swift new file mode 100644 index 00000000..cb925439 --- /dev/null +++ b/workflows/cp-leaveout/swift/plangen_2.swift @@ -0,0 +1,96 @@ + +/* + PLANGEN 2 SWIFT + Currently working version for Challenge Problem Uno +*/ + +// This DB configuration and python_db() function will put all +// calls to python_db() on rank DB corresponding to +// environment variable TURBINE_DB_WORKERS: + +// Use plangen from Supervisor! 
+ +pragma worktypedef DB; + +@dispatch=DB +(string output) python_db(string code, string expr="repr(0)") +"turbine" "0.1.0" + [ "set <> [ turbine::python 1 1 <> <> ]" ]; + +// Simply use python_db() to log the DB rank: +python_db( +---- +import os, sys +print("This rank is the DB rank: %s" % os.getenv("ADLB_RANK_SELF")) +sys.stdout.flush() +---- +); + +(string check) plangen_check() { + // Simple test that we can import plangen + check = python_db(---- +try: + import plangen + result = 'OK' +except Exception as e: + result = str(e) + ----, + "result"); +} + +(string result) plangen_prep(string db_file, string plan_json, string runtype) +{ +// Initialize the DB +result = python_persist( +---- +import sys, traceback +import plangen +try: + result = str(plangen.plan_prep('%s', '%s', '%s')) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print(str(e) + ' ... \\n' + ''.join(s)) + sys.stdout.flush() + result = 'EXCEPTION' +---- % (db_file, plan_json, runtype), +"result"); +} + +(string result) plangen_start(string node, string plan_id) +{ + result = python_db( +---- +import sys, traceback +import plangen +try: + result = str(plangen.start_subplan('%s', '%s', %s, '%s', '%s')) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + print('EXCEPTION in plangen_start()\\n' + + str(e) + ' ... \\n' + ''.join(s)) + sys.stdout.flush() + result = "EXCEPTION" +---- % (db_file, plan_json, plan_id, node, runtype), + "result"); +} + +(string result) plangen_stop(string node, string plan_id) +{ + result = python_db( +---- +import plangen +import fcntl, sys, traceback +try: + result = str(plangen.stop_subplan('%s', '%s', '%s', {})) +except Exception as e: + info = sys.exc_info() + s = traceback.format_tb(info[2]) + sys.stdout.write('EXCEPTION in plangen_stop()\\n' + + str(e) + ' ... 
\\n' + ''.join(s) + '\\n') + sys.stdout.flush() + result = 'EXCEPTION' +---- % (db_file, plan_id, node), + "result"); +} diff --git a/workflows/cp-leaveout/swift/sweep.swift b/workflows/cp-leaveout/swift/sweep.swift deleted file mode 100644 index 837241b7..00000000 --- a/workflows/cp-leaveout/swift/sweep.swift +++ /dev/null @@ -1,61 +0,0 @@ - -/* - CP LEAVEOUT SWIFT - Main workflow -*/ - -import assert; -import files; -import io; -import python; -import unix; -import sys; -import string; -import location; -import math; - -string FRAMEWORK = "keras"; - -string xcorr_root = getenv("XCORR_ROOT"); -string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); -string emews_root = getenv("EMEWS_PROJECT_ROOT"); -string turbine_output = getenv("TURBINE_OUTPUT"); - -printf("TURBINE_OUTPUT: " + turbine_output); - -string db_file = argv("db_file"); -string cache_dir = argv("cache_dir"); -// string xcorr_data_dir = argv("xcorr_data_dir"); -string gpus = argv("gpus", ""); - -// string restart_number = argv("restart_number", "1"); -string site = argv("site"); - -int N = 4; // The divisor of the leave out rows/columns - -int X[] = [0:0]; -int Y[] = [0:N]; - -string results[][]; - -app (file o) fake_uno(int leaveout_cell_line, int leaveout_drug) -{ - (emews_root/"swift/fake-uno.sh") leaveout_cell_line leaveout_drug o ; -} - -app (file o) fake_nt3(int leaveout_punch_x, int leaveout_punch_y) -{ - (emews_root/"swift/fake-nt3.sh") leaveout_punch_x leaveout_punch_y o ; -} - -foreach punch_x in X -{ - foreach punch_y in Y - { - file f = fake_nt3(punch_x, punch_y); - results[punch_x][punch_y] = read(f); - } -} - -// The test*.sh scripts check for "RESULTS:" -printf("RESULTS: %i", size(results)); diff --git a/workflows/cp-leaveout/swift/workflow-tic.sh b/workflows/cp-leaveout/swift/workflow-tic.sh index 8206830c..f1229f86 100755 --- a/workflows/cp-leaveout/swift/workflow-tic.sh +++ b/workflows/cp-leaveout/swift/workflow-tic.sh @@ -117,7 +117,7 @@ mkdir -p $TURBINE_OUTPUT/run # Allow the user to 
set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp-leaveout/swift/workflow.sh b/workflows/cp-leaveout/swift/workflow.sh index c67cc096..5fd30a2a 100755 --- a/workflows/cp-leaveout/swift/workflow.sh +++ b/workflows/cp-leaveout/swift/workflow.sh @@ -1,5 +1,6 @@ #! /usr/bin/env bash set -eu +shopt -s nullglob # CP-LEAVEOUT WORKFLOW # Main entry point for CP-LEAVEOUT workflow @@ -19,7 +20,7 @@ BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/Uno export BENCHMARK_TIMEOUT export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} -PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR +export PYTHONPATH=${PYTHONPATH:-}:$BENCHMARK_DIR:$BENCHMARKS_ROOT/Pilot1/Uno SCRIPT_NAME=$(basename $0) @@ -30,10 +31,11 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME EPOCH_MODE" + echo " EPOCH_MODE is one of the compute_epochs_*.swift modules." } -if (( ${#} < 5 )) +if (( ${#} < 6 )) then usage exit 1 @@ -45,7 +47,7 @@ if ! { get_cfg_sys $3 get_cfg_prm $4 MODEL_NAME=$5 - EPOCH_MODE=${6:-log} # Default to log mode + EPOCH_MODE=$6 } then usage @@ -57,16 +59,25 @@ WORKFLOW_ARGS=$* echo "WORKFLOW.SH: Running model: $MODEL_NAME for EXPID: $EXPID" +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "workflow.sh: You must set CANDLE_DATA_DIR" + exit 1 +fi + source_site env $SITE source_site sched $SITE -PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup +# Note: insist on plangen from Supervisor! 
+PYTHONPATH=$EMEWS_PROJECT_ROOT/py:$PYTHONPATH # For plangen, data_setup PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools, model_runner APP_PYTHONPATH+=:$EMEWS_PROJECT_ROOT/py # For plangen, data_setup APP_PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # For log_tools APP_PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common # For Benchmarks +export APP_PYTHONPATH -export TURBINE_JOBNAME="JOB:${EXPID}" +# Job name limit on Frontier: 8 +export TURBINE_JOBNAME=$EXPID if [ -z ${GPU_STRING+x} ]; then @@ -95,6 +106,7 @@ fi CMD_LINE_ARGS=( --benchmark_timeout=$BENCHMARK_TIMEOUT --site=$SITE --db_file=$DB_FILE + --user=$USER $GPU_ARG $WORKFLOW_ARGS ) @@ -104,24 +116,30 @@ then echo "Restart requested ..." if [[ ! -d $TURBINE_OUTPUT ]] then - echo "No prior run found! (tried $TURBINE_OUTPUT/output.txt)" + echo "ERROR: No prior run found! (tried $TURBINE_OUTPUT)" + exit 1 + fi + if [[ ! -f $TURBINE_OUTPUT/cplo.db ]] + then + echo "ERROR: No DB found! (tried $TURBINE_OUTPUT/cplo.db)" exit 1 fi if [[ ! -f $TURBINE_OUTPUT/output.txt ]] then # If output.txt does not exist, assume the moves already happened - echo "The outputs were already moved from $EXPID" + echo "WARNING: The outputs were already moved from $EXPID" else - next $TURBINE_OUTPUT/restarts/%i + next "$TURBINE_OUTPUT/restarts/%02i" # cf. 
utils.sh:next() PRIOR_RUN=$REPLY echo "Moving old outputs to $PRIOR_RUN" mkdir -pv $PRIOR_RUN PRIORS=( $TURBINE_OUTPUT/output.txt $TURBINE_OUTPUT/out $TURBINE_OUTPUT/turbine* - $TURBINE_OUTPUT/jobid.txt ) - mv ${PRIORS[@]} $PRIOR_RUN - cp -v $TURBINE_OUTPUT/cplo.db $PRIOR_RUN + $TURBINE_OUTPUT/jobid.txt + $TURBINE_OUTPUT/plangen_db.log* ) + mv ${PRIORS[@]} $PRIOR_RUN + cp $TURBINE_OUTPUT/cplo.db $PRIOR_RUN fi else # Not a restart if [[ -f $TURBINE_OUTPUT/output.txt ]] @@ -140,13 +158,18 @@ log_script mkdir -p $TURBINE_OUTPUT/run # Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_DIR=${CANDLE_MODEL_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_MODULE=${CANDLE_MODEL_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh EPOCH_MODE_MODULE="compute_epochs_$EPOCH_MODE" +if [[ ! -f swift/$EPOCH_MODE_MODULE.swift ]] +then + abort "workflow.sh: No such EPOCH_MODE: swift/$EPOCH_MODE_MODULE.swift" +fi + WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow.swift} echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" @@ -166,7 +189,7 @@ then : fi -# which python swift-t java +which python swift-t java if [[ ${MACHINE:-} == "" ]] then @@ -183,40 +206,78 @@ else STDOUT="" fi -TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +TURBINE_STDOUT="" +if [[ $SITE == "summit" || $SITE == "frontier" ]] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi mkdir -pv $TURBINE_OUTPUT/out +LD_LIBRARY_PATH=/opt/cray/libfabric/1.15.2.0/lib64 + +export MODEL_RETURN="val_loss" + +export TURBINE_LEADER_HOOK_STARTUP="$( sed 's/#.*//;s/$/;/' $EMEWS_PROJECT_ROOT/swift/hook-1.tcl )" + +# Environment variables KEY=VALUE passed into workflow. +# If exported, a VALUE does not need to be provided. 
+ENVS=( + # Where the Benchmarks are: + BENCHMARKS_ROOT + # The top-level directory for this workflow: + EMEWS_PROJECT_ROOT + # This will be pre-pended into PYTHONPATH if model.sh is used: + APP_PYTHONPATH + # Tell Python to auto-flush stdout: + PYTHONUNBUFFERED=1 + # Other site-specific Python settings: + # $( python_envs ) + # The CANDLE model: + MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} + MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} + # Location of model.sh: + MODEL_SH + # The CANDLE model name: + MODEL_NAME + # The statistic to return from each model: + MODEL_RETURN + # The computing site we are running on: + SITE + # A timeout in seconds for each model: + BENCHMARK_TIMEOUT + SH_TIMEOUT + # If 1, do not crash workflow on model errors: + IGNORE_ERRORS +) + +# Number of ranks to allocate for the DB: +export TURBINE_DB_WORKERS=1 + +# Insert -e flags for Swift/T command line: +ENV_ARG="-e $( echo ${ENVS[@]} | sed 's/ */ -e /g' )" + +export TURBINE_LOG=0 + swift-t -O 0 -n $PROCS \ ${MACHINE:-} \ - -p -I $EQR -r $EQR \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ + -p \ + -I $CANDLE_MODEL_DIR \ + -i $CANDLE_MODEL_MODULE \ -I $EMEWS_PROJECT_ROOT/swift \ -i $EPOCH_MODE_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ - -e BENCHMARKS_ROOT \ - -e EMEWS_PROJECT_ROOT \ - -e APP_PYTHONPATH=$APP_PYTHONPATH \ - $( python_envs ) \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e TURBINE_STDOUT=$TURBINE_STDOUT \ - -e OBJ_RETURN \ - -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ - -e MODEL_SH \ - -e MODEL_NAME \ - -e SITE \ - -e BENCHMARK_TIMEOUT \ - -e BENCHMARKS_ROOT \ - -e SH_TIMEOUT \ - -e IGNORE_ERRORS \ - -e TURBINE_DB_WORKERS=1 \ + ${ENV_ARG} \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} | \ - tee $STDOUT + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} + # | \ + # tee $STDOUT +# -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ +# -e USER # Needed on Summit to find NVME # -j /usr/bin/java # Give this to 
Swift/T if needed for Java -# -e PYTHONUNBUFFERED=1 # May be needed if error output is being lost +# -e PYTHONVERBOSE=1 # Debugs module load confusion + if (( ${PIPESTATUS[0]} )) then @@ -224,5 +285,16 @@ then exit 1 fi +# # Check job output +# TURBINE_OUTPUT=$( readlink turbine-output ) +# OUTPUT=turbine-output/output.txt +# WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# # Wait for job +# queue_wait + +# SCRIPT=$( basename $0 .sh ) +# check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + echo "WORKFLOW OK." echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/cp-leaveout/swift/workflow.swift b/workflows/cp-leaveout/swift/workflow.swift index b166b301..8d19f639 100644 --- a/workflows/cp-leaveout/swift/workflow.swift +++ b/workflows/cp-leaveout/swift/workflow.swift @@ -4,11 +4,14 @@ Simply run with: 'swift-t workflow.swift ' Or specify the N, S values: - 'swift-t workflow.swift -N=6 -S=6 ' - for 55,986 tasks. + 'swift-t workflow.swift -N=4 -S=6 ' + for ### tasks. + Flags: -N : Number of nodes per stage (see default in code) -S : Number of stages (see default in code) + -E : Number of epochs (see default in Benchmark) + -P : Early stopping patience (see default in code) -r : Use RunType.RESTART, default is RunType.RUN_ALL RUN_ALL means this is a fresh run with no prior results @@ -20,10 +23,15 @@ --benchmark_data= : Used by data_setup to set softlinks to Uno cache and uno_auc_model.txt + NOTE: "token" variables are used to ensure that children do + not run before their parents NOTE: This workflow has some complex Python Exception handling code that will be pushed into Swift/T for conciseness... 
NOTE: On Summit, you have to use sys.stdout.flush() after Python output on stdout + + RESTART EXAMPLE: + test/test-512.sh summit EXP003 flat -r -N=4 -S=6 -E=5 -P=5 */ import assert; @@ -35,6 +43,7 @@ import string; import sys; import candle_utils; +import plangen_2; report_env(); @@ -51,7 +60,6 @@ else N = 0; } // Maximum stage number with default -// (tested up to S=7, 21,844 dummy tasks) int S; S_s = argv("S", "2"); assert(strlen(S_s) > 0, "Set argument S with -S=") => @@ -65,12 +73,16 @@ else { runtype = "plangen.RunType.RUN_ALL"; } -E_s = argv("E", "20"); +E_s = argv("E", "50"); assert(strlen(E_s) > 0, "workflow.swift: you must provide an argument to -E"); int max_epochs = string2int(E_s); // epochs=20 is just under 2h on Summit. +P_s = argv("P", "10"); +assert(strlen(P_s) > 0, "workflow.swift: you must provide an argument to -P"); +int early_stopping = string2int(P_s); string plan_json = argv("plan_json"); string dataframe_csv = argv("dataframe_csv"); string db_file = argv("db_file"); +string user = argv("user", "NONE"); // for Summit NVME string benchmark_data = argv("benchmark_data"); int epoch_mode = string2int(argv("epoch_mode", "1")); int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); @@ -79,33 +91,52 @@ string exp_id = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // END WORKFLOW ARGUMENTS +printf("plangen: runtype:" + runtype); +printf("benchmark_data: " + benchmark_data); + // For compatibility with obj(): global const string FRAMEWORK = "keras"; -/** RUN STAGE: A recursive function that manages the stage dependencies */ +/** + RUN STAGE: A recursive function that manages the stage dependencies + token_parent: Blocks progress until the parent node is done + parent: The parent node, e.g., "1.2.3" + this: The current node, e.g., "1.2.3.4" +*/ (void v) -run_stage(int N, int S, string this, int stage, void block, - string plan_id, string db_file, string runtype) +run_stage(string db_file, string plan_id, string 
runtype, + void token_parent, int stage, string parent, string this) { - - printf("stage: %i this: %s", stage, this); + // printf("stage: %i parent: %s this: %s", stage, parent, this); // Run the model - void parent = run_single(this, stage, block, plan_id); + void token_this = run_single(plan_id, token_parent, stage, + parent, this); if (stage < S) { // Recurse to the child stages - foreach id_child in [1:N] + foreach child in [1:N] { - run_stage(N, S, this+"."+id_child, stage+1, parent, - plan_id, db_file, runtype); + run_stage(db_file, plan_id, runtype, + token_this, stage+1, + this, + "%s.%i" % (this, child) + ); // N, S, } } v = propagate(); } -/** RUN SINGLE: Set up and run a single model via obj(), plus the SQL ops */ -(void v) run_single(string node, int stage, void block, string plan_id) +/** + RUN SINGLE: Set up and run a single model via obj(), plus the SQL ops + token: Block on token so that this node does not run until the + parent is complete. + stage: The current stage, e.g., 4 + parent: The parent node, e.g., "1.2.3" + this: The current node, e.g., "1.2.3.4" +*/ +(void v) run_single(string plan_id, void token, int stage, + string parent, string this) { if (stage == 0) { @@ -113,118 +144,72 @@ run_stage(int N, int S, string this, int stage, void block, } else { - json_fragment = make_json_fragment(node, stage); - json = "{\"node\": \"%s\", %s}" % (node, json_fragment); - block => - printf("run_single(): running obj(%s)", node) => + json_fragment = make_json_fragment(parent, this, stage); + json = "{\"node\": \"%s\", %s}" % (this, json_fragment); + token => + printf("run_single(): running candle_model_train(%s)", this) => // Insert the model run into the DB - result1 = plangen_start(node, plan_id); + result1 = plangen_start(this, plan_id); assert(result1 != "EXCEPTION", "Exception in plangen_start()!"); if (result1 == "0") { // Run the model - obj_result = obj(json, node) - // Update the DB to complete the model run - => result2 = plangen_stop(node, 
plan_id); + model_result = candle_model_train(json, exp_id, this, model_name); printf("run_single(): completed: node: '%s' result: '%s'", - node, obj_result); - assert(obj_result != "EXCEPTION" && obj_result != "", - "Exception in obj()!"); + this, model_result); + // Update the DB to complete the model run + string result2; + if (model_result != "RUN_EXCEPTION") + { + result2 = plangen_stop(this, plan_id); + } + else + { + result2 = "RETRY"; + } + assert(model_result != "", "Error in obj(): result is empty!"); + assert(model_result != "EXCEPTION", "Exception in obj()!"); assert(result2 != "EXCEPTION", "Exception in plangen_stop()!"); printf("run_single(): stop_subplan result: '%s'", result2); - v = propagate(obj_result); + v = propagate(model_result); } - else + else // result1 != 0 { printf("run_single(): plan node already marked complete: " + - "%s result=%s", node, result1) => + "%s result=%s", this, result1) => v = propagate(); } } } -// This DB configuration and python_db() function will put all -// calls to python_db() on rank DB corresponding to -// environment variable TURBINE_DB_WORKERS: - -pragma worktypedef DB; - -@dispatch=DB -(string output) python_db(string code, string expr="repr(0)") -"turbine" "0.1.0" - [ "set <> [ turbine::python 1 1 <> <> ]" ]; - -// Simply use python_db() to log the DB rank: -python_db( ----- -import os, sys -print("This rank is the DB rank: %s" % os.getenv("ADLB_RANK_SELF")) -sys.stdout.flush() ----- -); - -(string result) plangen_start(string node, string plan_id) -{ - result = python_db( ----- -import fcntl, sys, traceback -import plangen -try: - result = str(plangen.start_subplan('%s', '%s', %s, '%s', %s)) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - print(str(e) + ' ... 
\\n' + ''.join(s)) - sys.stdout.flush() - result = "EXCEPTION" ----- % (db_file, plan_json, plan_id, node, runtype), - "result"); -} - -(string result) plangen_stop(string node, string plan_id) -{ - result = python_db( ----- -import plangen -import fcntl, sys, traceback -try: - result = str(plangen.stop_subplan('%s', '%s', '%s', {})) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - sys.stdout.write(str(e) + ' ... \\n' + ''.join(s) + '\\n') - sys.stdout.flush() - result = 'EXCEPTION' ----- % (db_file, plan_id, node), - "result"); -} - /** MAKE JSON FRAGMENT: Construct the JSON parameter fragment for the model */ -(string result) make_json_fragment(string this, int stage) +(string result) make_json_fragment(string parent, string this, int stage) { int epochs = compute_epochs(stage); json_fragment = ---- -"pre_module": "data_setup", -"post_module": "data_setup", -"plan": "%s", -"config_file": "uno_auc_model.txt", -"cache": "cache/top6_auc", -"dataframe_from": "%s", -"save_weights": "model.h5", -"gpus": "0", -"epochs": %i, -"es": "True", +"pre_module": "data_setup", +"post_module": "data_setup", +"plan": "%s", +"config_file": "uno_auc_model.txt", +"cache": "cache/top6_auc", +"user": "%s", +"dataframe_from": "%s", +"save_weights": "save/model.h5", +"gpus": "0", +"epochs": %i, +"es": "True", +"early_stopping": %i, +"experiment_id": "%s", +"run_id": "%s", "use_exported_data": "topN.uno.h5", -"benchmark_data": "%s" +"benchmark_data": "%s" ---- % -(plan_json, dataframe_csv, epochs, benchmark_data); +(plan_json, user, dataframe_csv, epochs, early_stopping, exp_id, this, benchmark_data); if (stage > 1) { - n = strlen(this); - parent = substring(this, 0, n-2); result = json_fragment + ---- , -"initial_weights": "../%s/model.h5" +"initial_weights": "../%s/save/model.h5" ---- % parent; } else @@ -236,32 +221,10 @@ except Exception as e: printf("CP LEAVEOUT WORKFLOW: START: N=%i S=%i", N, S); // First: simple test that we can import plangen 
-check = python_persist(---- -try: - import plangen - result = 'OK' -except Exception as e: - result = str(e) -----, -"result"); -printf("python result: import plangen: '%s'", check) => - assert(check == "OK", "could not import plangen, check PYTHONPATH!"); +check = plangen_check(); +assert(check == "OK", "could not import plangen, check PYTHONPATH!"); -// Initialize the DB -plan_id = python_persist( ----- -import sys, traceback -import plangen -try: - result = str(plangen.plan_prep('%s', '%s', %s)) -except Exception as e: - info = sys.exc_info() - s = traceback.format_tb(info[2]) - print(str(e) + ' ... \\n' + ''.join(s)) - sys.stdout.flush() - result = 'EXCEPTION' ----- % (db_file, plan_json, runtype), -"result"); +plan_id = plangen_prep(db_file, plan_json, "NOTHING"); printf("DB plan_id: %s", plan_id); assert(plan_id != "EXCEPTION", "Plan prep failed!"); @@ -270,5 +233,5 @@ assert(plan_id != "-1", "Plan already exists!"); // Kickoff the workflow stage = 0; -run_stage(N, S, "1", stage, propagate(), plan_id, db_file, runtype); -// printf("CP LEAVEOUT WORKFLOW: RESULTS: COMPLETE"); +run_stage(db_file, plan_id, runtype, + propagate(), stage, "", "1"); diff --git a/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh b/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh index 77d03789..e27b7fc9 100644 --- a/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh +++ b/workflows/cp-leaveout/test-chained/cfg-stage-sys.sh @@ -10,8 +10,8 @@ export PROCS=${PROCS:-12} # MPI processes per node. This should not exceed PROCS. export PPN=${PPN:-1} -# Benchmark run timeout: benchmark run will timeouT -# after the specified number of seconds. -1 is no timeout. +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. 
BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:--1} # Uncomment below to use custom python script to run @@ -35,9 +35,9 @@ export WALLTIME=${WALLTIME:-00:10:00} # queue export QUEUE=${QUEUE:-batch} -# += is necessary here as the job dependency args are +# += is necessary here as the job dependency args are # set via TURBINE_DIRECTIVE -TURBINE_DIRECTIVE="\n#BSUB -q $QUEUE\n#BSUB -alloc_flags \"NVME maximizegpfs\"\n" +TURBINE_DIRECTIVE="\n#BSUB -q $QUEUE\n#BSUB -alloc_flags \"NVME maximizegpfs\"\n" export TURBINE_DIRECTIVE+=${TURBINE_DIRECTIVE_ARGS:-} TURBINE_LAUNCH_OPTIONS="-a1 -c42 -g1" @@ -58,4 +58,3 @@ echo " IGNORE_ERRORS: $IGNORE_ERRORS" # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov - diff --git a/workflows/cp-leaveout/test-chained/cfg.json b/workflows/cp-leaveout/test-chained/cfg.json index 1dc4e65f..ddc2bc65 100644 --- a/workflows/cp-leaveout/test-chained/cfg.json +++ b/workflows/cp-leaveout/test-chained/cfg.json @@ -1,30 +1,30 @@ { - "site" : "summit", - "plan" : "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs/plangen_cell1593-p4_drug1779-p1.json", - "submit_script" : "./test-1.sh", - "upf_directory" : "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs", - "job_chain_arg" : "#BSUB -w done()", - "stages" : "-1", - "first_stage" : 1, - "first_stage_parent_directory" : "", - - "stage_cfg_script" : "./cfg-stage-sys.sh", + "site": "summit", + "plan": "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs/plangen_cell1593-p4_drug1779-p1.json", + "submit_script": "./test-1.sh", + "upf_directory": "/gpfs/alpine/med106/scratch/ncollier/job-chain/inputs", + "job_chain_arg": "#BSUB -w done()", + "stages": "-1", + "first_stage": 1, + "first_stage_parent_directory": "", - "stage_cfgs" : [ - { - "stage" : 1, - "WALLTIME" : "02:00:00", - "PROCS" : 6 - }, + "stage_cfg_script": "./cfg-stage-sys.sh", - { - "stage" : 2, - "WALLTIME" : "01:00:00" - }, + "stage_cfgs": [ + { + "stage": 1, + "WALLTIME": "02:00:00", + "PROCS": 6 + }, - { - "stage" : 3, - 
"WALLTIME" : "01:00:00" - } - ] + { + "stage": 2, + "WALLTIME": "01:00:00" + }, + + { + "stage": 3, + "WALLTIME": "01:00:00" + } + ] } diff --git a/workflows/cp-leaveout/test/cfg-sys-1.sh b/workflows/cp-leaveout/test/cfg-sys-1.sh index 02f7bb0f..21014013 100644 --- a/workflows/cp-leaveout/test/cfg-sys-1.sh +++ b/workflows/cp-leaveout/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-5} +export PROCS=${PROCS:-2} # # Number of processes to use for resident tasks, # # i.e., the number of mlrMBO instances to run @@ -15,15 +15,16 @@ export PROCS=${PROCS:-5} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-flat-quad} +# export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export WALLTIME=${WALLTIME:-02:00:00} +export WALLTIME=${WALLTIME:-01:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -#export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +# export TURBINE_LAUNCH_OPTIONS="-g3 -c21 -a2" # For PPN=2 # Does not work export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" #export PROJECT=Candle_ECP diff --git a/workflows/cp-leaveout/test/cfg-sys-512.sh b/workflows/cp-leaveout/test/cfg-sys-512.sh index 2c77c29e..43699e1c 100644 --- a/workflows/cp-leaveout/test/cfg-sys-512.sh +++ b/workflows/cp-leaveout/test/cfg-sys-512.sh @@ -6,23 +6,23 @@ # and 1 process is reserved for the DB client. # The default of 4 gives you 2 workers, # i.e., 2 concurrent Keras runs. 
-export PROCS=${PROCS:-4} +# Bin Min Nodes Max Nodes Max Walltime (Hours) Aging Boost (Days) +# 1 2,765 4,608 24.0 15 +# 2 922 2,764 24.0 10 +# 3 92 921 12.0 0 +# 4 46 91 6.0 0 +# 5 1 45 2.0 +export PROCS=${PROCS:-2048} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} - -# For Theta: -export QUEUE=${QUEUE:-debug-flat-quad} -# export QUEUE=R.candle +export PPN=${PPN:-8} export WALLTIME=${WALLTIME:-12:00:00} # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" -#export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" -export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" #export PROJECT=Candle_ECP diff --git a/workflows/cp-leaveout/test/test-1.sh b/workflows/cp-leaveout/test/test-1.sh index 4ffd5501..afc4942d 100755 --- a/workflows/cp-leaveout/test/test-1.sh +++ b/workflows/cp-leaveout/test/test-1.sh @@ -34,11 +34,27 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # Data files # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json -SCRATCH=/gpfs/alpine/med106/scratch/wozniak -CANDLE_DATA=$SCRATCH/CANDLE-Data -PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json -DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv -BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv + +PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell1593-p4_drug1779-p1.json +BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +SCRATCH=/usb1/wozniak/CANDLE-Benchmarks-Data +CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv + +# Summit data: +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +# PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno + + +# 
SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data +# PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported @@ -71,7 +87,8 @@ OUTPUT=turbine-output/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) # Wait for job -queue_wait +# queue_wait +exit SCRIPT=$( basename $0 .sh ) check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID diff --git a/workflows/cp-leaveout/test/test-512.sh b/workflows/cp-leaveout/test/test-512.sh index eaa88766..3dff6707 100755 --- a/workflows/cp-leaveout/test/test-512.sh +++ b/workflows/cp-leaveout/test/test-512.sh @@ -32,17 +32,47 @@ source $WORKFLOWS_ROOT/common/sh/utils.sh export CFG_SYS=$THIS/cfg-sys-512.sh export CFG_PRM=$THIS/cfg-prm-1.sh +# # Data files +# # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +# CANDLE_DATA=$SCRATCH/Milestone13 +# PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +# BENCHMARK_DATA=$SCRATCH/Milestone13/Benchmarks/Pilot1/Uno + # Data files # PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json -SCRATCH=/gpfs/alpine/med106/scratch/hsyoo -CANDLE_DATA=$SCRATCH/Milestone13 -PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json -DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv -BENCHMARK_DATA=$SCRATCH/Milestone13/Benchmarks/Pilot1/Uno +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# SCRATCH=/usb2/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data/Milestone-13 +CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/top21_2020Jul +CANDLE_DATA=$CANDLE_DATA_DIR/ChallengeProblem/top21_2020Jul +# CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem/old +# 
PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.feather +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.hdf5 +# if (( ! ${BIG_PLAN:-0} )) +# then +# PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1-u.json # 2022-07 +# # PLAN_JSON=$CANDLE_DATA/plangen_CELL2917-p4_DRUG2148-p4.json # 2023-02 +# else +# PLAN_JSON=/gpfs/alpine/med106/proj-shared/brettin/Supervisor/workflows/cp-leaveout/plangen_CELL2917-p4_DRUG2148-p4.json +# fi +PLAN_JSON=$CANDLE_DATA/plangen_CELL703-p4_DRUG1492-p4.json +# DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 +DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +BENCHMARK_DATA=$CANDLE_DATA +# PROJ_SHARED=/gpfs/alpine/med106/proj-shared/wozniak +# BENCHMARK_DATA=$PROJ_SHARED/proj/Benchmarks/Pilot1/Uno # What to return from the objective function (Keras model) -# val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" if [[ $SITE == "theta" ]] then @@ -58,6 +88,13 @@ do fi done +if [[ ! 
-d $BENCHMARK_DATA/cache ]] +then + echo "$0: The cache does not exist: $BENCHMARK_DATA/cache" + echo "$0: Use mkdir to create this directory" + exit 1 +fi + # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ $MODEL_NAME $WORKFLOW_ARGS \ @@ -71,7 +108,8 @@ OUTPUT=turbine-output/output.txt WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) # Wait for job -queue_wait +# queue_wait +exit SCRIPT=$( basename $0 .sh ) check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID diff --git a/workflows/cp-leaveout/test/test-bl-1.sh b/workflows/cp-leaveout/test/test-bl-1.sh new file mode 100755 index 00000000..2422bece --- /dev/null +++ b/workflows/cp-leaveout/test/test-bl-1.sh @@ -0,0 +1,99 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT BASELINE TEST 1 + +usage() +{ + echo "Usage: test SITE EXPID WORKFLOW_ARGS" +} + +if (( ${#} == 0 )) +then + usage + exit 1 +fi + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +export MODEL_NAME=uno # nt3 + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-512.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# Data files +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv + +# Data files +# SUMMIT: +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +# SCRATCH=/gpfs/alpine/med106/scratch/hsyoo +SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# SCRATCH=/usb2/wozniak +CANDLE_DATA=$SCRATCH/CANDLE-Data/ChallengeProblem +PLAN_JSON=$CANDLE_DATA/plangen_cell1593-p4_drug1779-p1.json +# DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labled.csv +DATAFRAME_CSV=$CANDLE_DATA/top_21.res_reg.cf_rnaseq.dd_dragon7.labeled.hdf5 +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno +# BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno +BENCHMARK_DATA=$CANDLE_DATA + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +for f in $DATAFRAME_CSV +do + if [[ ! 
-f $f ]] + then + abort "$0: does not exist: $f" + fi +done + +if [[ $SITE == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi + +# Submit job +export WORKFLOW_SWIFT=baseline-error.swift +# set -x +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME none $WORKFLOW_ARGS \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA \ + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +queue_wait + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/cp-leaveout/test/test-dunedin.sh b/workflows/cp-leaveout/test/test-dunedin.sh new file mode 100755 index 00000000..d710da70 --- /dev/null +++ b/workflows/cp-leaveout/test/test-dunedin.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT TEST DUNEDIN + +usage() +{ + echo "Usage: test SITE EXPID WORKFLOW_ARGS" +} + +if (( ${#} == 0 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +export MODEL_NAME=uno # nt3 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# Data files +# PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json + +PLAN_JSON=$EMEWS_PROJECT_ROOT/plangen_cell8-p2_drug8-p2.json +DATAFRAME_CSV=/usb1/wozniak/CANDLE-Benchmarks-Data/top21_dataframe_8x8.csv +BENCHMARK_DATA=$HOME/proj/Benchmarks/Pilot1/Uno + +# SCRATCH=/gpfs/alpine/med106/scratch/wozniak +# CANDLE_DATA=$SCRATCH/CANDLE-Data +# PLAN_JSON=$CANDLE_DATA/plangen_cell8-p2_drug8-p2.json +# DATAFRAME_CSV=$CANDLE_DATA/top21_dataframe_8x8.csv +# BENCHMARK_DATA=$SCRATCH/proj/Benchmarks/Pilot1/Uno + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +for f in $DATAFRAME_CSV $PLAN_JSON +do + if ! [[ -f $f ]] + then + echo "$0: does not exist: $f" + exit 1 + fi +done + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +queue_wait + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/cp-leaveout/test/test-lambda-1.sh b/workflows/cp-leaveout/test/test-lambda-1.sh new file mode 100755 index 00000000..371f025d --- /dev/null +++ b/workflows/cp-leaveout/test/test-lambda-1.sh @@ -0,0 +1,83 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT TEST LAMBDA 1 + +SCRIPT=$( basename $0 .sh ) + +usage() +{ + echo "Usage: $0 SITE EXPID WORKFLOW_ARGS" +} + +if (( 
${#} < 2 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +shift 2 +WORKFLOW_ARGS=$* + +SCRIPT=$( basename $0 .sh ) + +export MODEL_NAME=uno # nt3 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-512.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# # Data files + +# Data files +CANDLE_DATA=$HOME/CANDLE_DATA_DIR/ChallengeProblem/top21_2020Jul +PLAN_JSON=$CANDLE_DATA/plangen_cell703-p4_drug1492-p1.json # NEW 2022-07 +# DATAFRAME_CSV=$CANDLE_DATA/topN.uno.h5 +# DATAFRAME_CSV=$CANDLE_DATA/top21.h5 # 2022-07 +# DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned-dd.h5 # NEW 2022-10 +DATAFRAME_CSV=$CANDLE_DATA/top21-cleaned.h5 +# DATAFRAME_CSV=$CANDLE_DATA/top21_uno_v2.h5 +BENCHMARK_DATA=$CANDLE_DATA + +# What to return from the objective function (Keras model) +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" + +for f in $DATAFRAME_CSV $PLAN_JSON +do + if ! [[ -f $f ]] + then + echo "$0: does not exist: $f" + exit 1 + fi +done + +if [[ ! 
-e $BENCHMARK_DATA/cache ]] +then + echo "$0: The cache does not exist: $BENCHMARK_DATA/cache" + echo "$0: Use mkdir to create this directory" + exit 1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +echo "$SCRIPT: OK" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/cp-leaveout/test/test-numpy-delete.py b/workflows/cp-leaveout/test/test-numpy-delete.py index e8161108..f6539091 100644 --- a/workflows/cp-leaveout/test/test-numpy-delete.py +++ b/workflows/cp-leaveout/test/test-numpy-delete.py @@ -1,7 +1,6 @@ - import numpy as np A = np.eye(4) print(A) -A = np.delete(A,1,axis=0) +A = np.delete(A, 1, axis=0) print(A) diff --git a/workflows/cp-leaveout/test/test-rn-1.sh b/workflows/cp-leaveout/test/test-rn-1.sh new file mode 100755 index 00000000..9fa3daf7 --- /dev/null +++ b/workflows/cp-leaveout/test/test-rn-1.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -eu + +# CP LEAVEOUT ResNet 1 + +usage() +{ + echo "Usage: test SITE EXPID EPOCH_MODE WORKFLOW_ARGS" + echo " EPOCH_MODE is one of the compute_epochs_*.swift modules." +} + +if (( ${#} < 3 )) +then + usage + exit 1 +fi + +SITE=$1 +RUN_DIR=$2 +EPOCH_MODE=$3 +shift 3 +WORKFLOW_ARGS=$* + +export MODEL_PYTHON_DIR=$HOME/proj/ai-apps +export MODEL_NAME=resnet50 + +# Self-configure +THIS=$( readlink --canonicalize $( dirname $0 ) ) +EMEWS_PROJECT_ROOT=$( readlink --canonicalize $THIS/.. ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( readlink --canonicalize $EMEWS_PROJECT_ROOT/.. 
) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default), loss, and val_corr are supported +# export OBJ_RETURN="val_loss" +export OBJ_RETURN="loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +PLAN_JSON="" +DATAFRAME_CSV="" +BENCHMARK_DATA="" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME $EPOCH_MODE $WORKFLOW_ARGS \ + --plan_json=$PLAN_JSON \ + --dataframe_csv=$DATAFRAME_CSV \ + --benchmark_data=$BENCHMARK_DATA + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +OUTPUT=turbine-output/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +# Wait for job +# queue_wait +exit + +SCRIPT=$( basename $0 .sh ) +check_output "RESULTS:" $OUTPUT $WORKFLOW $SCRIPT $JOBID +check_output "EXIT CODE: 0" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/cp1/README.adoc b/workflows/cp1/README.adoc index bd4b9d6b..913e2aea 100644 --- a/workflows/cp1/README.adoc +++ b/workflows/cp1/README.adoc @@ -3,13 +3,13 @@ The Mini Challenge Problem has 4 distinct stages: 1) preprocessing feature selection; 2) HPO optimization using mlrMBO; 3) additional training of the best models identified in 2; and 4) performing inference -on the trained models from 3. +on the trained models from 3. ==== Feature Selection Rather than using all of the ~17,000 genes for prediction of drug response for Uno, we used the COXEN approach (see the _xcorr_ README) to select genes for building the prediction model in a scenario of model adaptation between studies/datasets. The COXEN approach performs statistical analysis to identify predictive and generalizable genes for prediction. 
We have five public cancer cell line drug response datasets, i.e., NCI-60, CTRP, GDSC, CCLE, and gCSI. These five studies may have common cancer cell lines and drugs included in their experiments, but each of them may also include unique cell lines, drugs, or combinations of the two. Thus, the COXEN approach targeting genes that are both predictive and generalizable can be helpful in the model adaptation between studies. The approach produces _cross correleated feature set_ files that are used as input to the model. A preprocessed data file will have a file name like `CTRP_CCLE_2000_1000_features.txt` -where the two studies are `CTRP` and `CCLE`, and the cross correlation +where the two studies are `CTRP` and `CCLE`, and the cross correlation coefficients are 2000 and 1000 ==== HPO using mlrMBO @@ -43,7 +43,7 @@ develop branch. The Uno benchmarks is in `Benchmarks/Pilot1/Uno`. === Data *Download the raw Uno feature data to the `Benchmarks/Data/Pilot1` directory.* -Note the data is quite large so, depending on the file system and machine, a better +Note the data is quite large so, depending on the file system and machine, a better choice may be to download and symlink to `Benchmarks/Data/Pilot1`. ---- @@ -61,8 +61,8 @@ $ tar xfz candle-cp1-data.tgz ---- *Generate the input data frames from the raw data and the feature files.* -Uno runs much faster with these as input rather than the raw data. -A train and test data frame needs to be created for each cross correlated feature file. +Uno runs much faster with these as input rather than the raw data. +A train and test data frame needs to be created for each cross correlated feature file. 
For example, to generate the data associated with the `CTRP_CCLE_2000_1000.txt` feature file: @@ -72,25 +72,25 @@ $ python python uno_baseline_keras2.py --train_sources CTRP --cell_feature_subse $ python uno_baseline_keras2.py --train_sources CCLE --cell_feature_subset_path CTRP_CCLE_2000_1000_features.txt --no_feature_source True --no_response_source True --preprocess_rnaseq combat --export_data CTRP_CCLE_2000_1000_test.h5 ---- -Note that in the train data creation the train_source is CTRP and in the test data -creation the train source is CCLE. The creation of the training and test data can be -time consuming for the larger datasets. +Note that in the train data creation the train_source is CTRP and in the test data +creation the train source is CCLE. The creation of the training and test data can be +time consuming for the larger datasets. == Running the Workflows === HPO Workflow -. Edit `Supervisor/workflows/cp1/data/studies1.txt` and `Supervisor/workflows/cp1/data/studies2.txt`. -These two study files specify the cross correlation between studies where each study in `studies1.txt` -is cross correlated with each study in `studies2.txt` except where they are the same. Add or remove +. Edit `Supervisor/workflows/cp1/data/studies1.txt` and `Supervisor/workflows/cp1/data/studies2.txt`. +These two study files specify the cross correlation between studies where each study in `studies1.txt` +is cross correlated with each study in `studies2.txt` except where they are the same. Add or remove (or comment out) study names in these files to omit that cross-correlation from the HPO instances. For example, if study1 contains CTRP and study2 contains CCLE and GDSC, then the workflow will run two HPOs: one for the CTRP_CCLE and one for the CTRP_GDSC cross-correlations. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test` directory. 
Be sure to update the lines in your `test-N.sh` that +from the `test` directory. Be sure to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your _cfg_ scripts. Namely, + @@ -103,18 +103,18 @@ export CFG_PRM=$THIS/cfg-prm-N.sh See the comments in `cfg-prm-1.sh`, and `cfg-sys-1.sh` for additional information on the various configuration parameters and how to edit them. -. Launch the run using your `test-N.sh` script, passing SITE, and optional -experiment id as arguments (e.g., `./test-10.sh [expid]`) where +. Launch the run using your `test-N.sh` script, passing SITE, and optional +experiment id as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one (e.g. X001). Each Uno -model run launched by the mlrMBO instances runs in own directory: -`exp_id/run/W_X_Y_Z` where _W_ is the id of the mlrMBO instance that launched the run, _X_ is the restart number +model run launched by the mlrMBO instances runs in own directory: +`exp_id/run/W_X_Y_Z` where _W_ is the id of the mlrMBO instance that launched the run, _X_ is the restart number (almost always 1 here), _Y_ is the iteration of the mlrMBO instance, and _Z_ is the id of the hyper parameter set produced by mlrMBO instance _W_ and with which Uno was launched. -A summary of each Uno run, organized by mlrMBO instance and iteration, will be +A summary of each Uno run, organized by mlrMBO instance and iteration, will be output in `exp_id/hpo_log/X_Y_hpo_runs.txt` where _X_ is the mlrMBO instance id, and _Y_ is the mlrMBO instance's iteration. Each row of this hpo log output contains info for a single Uno run and has the following format: @@ -127,15 +127,15 @@ where the `|` character is the delimiter. * Configuration and launch scripts in `test/` (e.g. 
`cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/workflow.swift` - swift file that executes the workflow -* `swift/workflow.sh` - launch script for the swift file. This script is +* `swift/workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test/`. === Further Training Workflow (AKA the UPF workflow) -. Select N number of models from those produced by each HPO instance and +. Select N number of models from those produced by each HPO instance and create the input parameter file (i.e., the "upf" file). The hpo_log results -from the HPO workflow can be used for this. The "Combine HPO logs files +from the HPO workflow can be used for this. The "Combine HPO logs files adding hpo_id and iteration" code in `scripts/plots.R` is an example of how those logs can be concatenated together while adding the hpo_id and iteration as column values. The python jupyter notebook `script/cp1_scripts.ipynb` contains @@ -150,8 +150,8 @@ in the upf file contains the hyperparameters for an Uno run in JSON format. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test_upf/` directory. Note this is *NOT* the `test/` directory. Be sure -to update the lines in your `test-N.sh` that +from the `test_upf/` directory. Note this is *NOT* the `test/` directory. Be sure +to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your scripts. Namely, + @@ -164,16 +164,16 @@ export CFG_PRM=$THIS/cfg-prm-N.sh See the comments in `cfg-prm-1.sh`, and `cfg-sys-1.sh` for additional information on the various configuration parameters and how to edit them. -. Launch the run using your `test-N.sh` script, passing SITE, -and optional experiment id as arguments (e.g., `./test-10.sh [expid]`) where +. 
Launch the run using your `test-N.sh` script, passing SITE, +and optional experiment id as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one (e.g., X001). Each Uno -model run launched by the workflow runs in own directory: -`exp_id/run/X` where _X_ is the id of the run and corresponds to the index -of the line of input data that was used for that run, that is, `run/0` contains -the output for the run that ran with the 1st line from the upf input +model run launched by the workflow runs in own directory: +`exp_id/run/X` where _X_ is the id of the run and corresponds to the index +of the line of input data that was used for that run, that is, `run/0` contains +the output for the run that ran with the 1st line from the upf input file, `run/1` for the second line and so on. In addition, `inputs.txt` and `results.txt` files are also created. @@ -183,13 +183,13 @@ The first contains the parameters used for each run and the second final val los * Configuration and launch scripts in `test_upf/` (e.g., `cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/upf_workflow.swift` - swift file that executes the workflow -* `swift/upf_workflow.sh` - launch script for the swift file. This script is +* `swift/upf_workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test_upf/`. === Inference -. Create the inference parameter file. Each line of the inference parameter +. Create the inference parameter file. 
Each line of the inference parameter file contains the HPO parameters for a single inference run in csv format with the following columns @@ -199,16 +199,16 @@ the following columns For example, `CTRP_GDSC_2000_1000_test.h5,/gpfs/alpine/med106/scratch/ncollier/experiments/full_training_2/run/0/,CTRP_GDSC_2000_1000` + -The test data is part of the data generated as part of the data requirments (see above), -and found in the so-called CACHE_DIR directory as defined in the `cfg-prm-N.sh` files. The +The test data is part of the data generated as part of the data requirements (see above), +and found in the so-called CACHE_DIR directory as defined in the `cfg-prm-N.sh` files. The "directory of the trained model" is a directory that contains a model trained in the further - training workflow. The run label can be an informative label for the run. The python jupyter + training workflow. The run label can be an informative label for the run. The python jupyter notebook `script/cp1_scripts.ipynb` has some sample code for creating this parameter file. . Create a set of _cfg_ and _test_ scripts for an experiment run by copying an existing set, i.e., `cfg-prm-1.sh`, `cfg-sys-1.sh`, and `test-1.sh` -from the `test_infer/` directory. Note this is *NOT* the `test/` directory. -Be sure to update the lines in your `test-N.sh` that +from the `test_infer/` directory. Note this is *NOT* the `test/` directory. +Be sure to update the lines in your `test-N.sh` that export the `cfg-prm-N.sh`, and `cfg-sys-N.sh` scripts to point to your _cfg_ scripts. Namely, + @@ -226,28 +226,28 @@ information on the various configuration parameters and how to edit them. to create multiple copies of the input data to avoid IO contention. If this is unnecessary, then the `infer.sh` should not need to be changed. -. Launch the run using your `test-N.sh` script, passing SITE, and optional experiment id -as arguments (e.g., `./test-10.sh [expid]`) where +. 
Launch the run using your `test-N.sh` script, passing SITE, and optional experiment id +as arguments (e.g., `./test-10.sh [expid]`) where site can be one of local, cori, theta, summit etc. All the output from running the workflow will appear in a directory named with the experiment id, either your specified one or the auto-generated one. Each Uno -model inference run launched by the workflow runs in its own directory: -`exp_id/run/X` where _X_ is the id of the run and corresponds to the index of the -line of input data that was used for that run. So, `run/0` contains -the output for the run that ran with the 1st line from the input -file, `run/1` for the second line and so on. Each inference run will +model inference run launched by the workflow runs in its own directory: +`exp_id/run/X` where _X_ is the id of the run and corresponds to the index of the +line of input data that was used for that run. So, `run/0` contains +the output for the run that ran with the 1st line from the input +file, `run/1` for the second line and so on. Each inference run will produce an `uno_pred.all.tsv` and an `uno_pred.tsv` file. The first contains -the predictions for each feature and the second is an aggregate view +the predictions for each feature and the second is an aggregate view of the first. Additionally a `log.txt` file is created in the experiment directory -that contains the name of the data input file, the model, the output directory, +that contains the name of the data input file, the model, the output directory, and number of predictions performed for each inference run. ==== Associated Files * Configuration and launch scripts in `test_infer/` (e.g. `cfg-prm-1.sh`, `test-1.sh`, etc.) * `swift/infer_workflow.swift` - swift file that executes the workflow -* `swift/infer_workflow.sh` - launch script for the swift file. This script is +* `swift/infer_workflow.sh` - launch script for the swift file. This script is configured and launched from the scripts in `test_infer/`. 
* `sh/infer.sh` - script used to launch the Uno benchmark's `uno_infer.py` to perform the actual inference. diff --git a/workflows/cp1/data/upf_use_exported_no_nci.txt b/workflows/cp1/data/upf_use_exported_no_nci.txt index 6bc6e2b8..deecf6b7 100644 --- a/workflows/cp1/data/upf_use_exported_no_nci.txt +++ b/workflows/cp1/data/upf_use_exported_no_nci.txt @@ -18,4 +18,3 @@ {"study1": "CTRP", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } {"study1": "gCSI", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } {"study1": "GDSC", "epochs": 1, "batch_size": 6144, "use_exported" : 1 } - diff --git a/workflows/cp1/db/db-hpo-init.py b/workflows/cp1/db/db-hpo-init.py index 88a7e371..b7153cc4 100644 --- a/workflows/cp1/db/db-hpo-init.py +++ b/workflows/cp1/db/db-hpo-init.py @@ -1,22 +1,24 @@ - # DB HPO INIT PY # Initialize the SQLite DB for HPO # See db-hpo-init.sql for the table schema -import os, sys +import os +import sys + import yaml +from xcorr_db import q, xcorr_db -from xcorr_db import xcorr_db, q +DB = xcorr_db("xcorr.db", log=False) -DB = xcorr_db('xcorr.db', log=False) def create_tables(db_hpo_init_sql): - """ Set up the tables defined in the SQL file """ + """Set up the tables defined in the SQL file.""" with open(db_hpo_init_sql) as fp: sqlcode = fp.read() DB.executescript(sqlcode) DB.commit() + # def create_indices(): # """ Create indices after data insertion for speed """ # DB.execute("create index features_index on features(record_id);") @@ -31,6 +33,7 @@ def create_tables(db_hpo_init_sql): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: diff --git a/workflows/cp1/db/db-hpo-list.py b/workflows/cp1/db/db-hpo-list.py index e9c88065..ef776165 100644 --- a/workflows/cp1/db/db-hpo-list.py +++ b/workflows/cp1/db/db-hpo-list.py @@ -1,7 +1,7 @@ - # DB HPO LIST -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db + def list_hpos(): results = [] @@ -9,49 +9,58 @@ def list_hpos(): 
DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break id, t = row[0:2] print("r") - results.append([id,t]) + results.append([id, t]) return results + def list_params(hpo_id): - """ hpo_id is a string here """ + """hpo_id is a string here.""" results = {} - cmd = "select param_id, name from hpo_hyperparam_defns " + \ - "where hpo_id=%s;" % hpo_id + cmd = ("select param_id, name from hpo_hyperparam_defns " + + "where hpo_id=%s;" % hpo_id) DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break param_id, name = row[0:2] - results[param_id] = [ name ] + results[param_id] = [name] for param_id in results.keys(): values = list_values(param_id) results[param_id].append(values) return results + def list_values(param_id): - """ param_id is a string here """ + """param_id is a string here.""" results = [] - cmd = "select value_id, value from hpo_hyperparam_values " + \ - "where param_id=%s;" % param_id + cmd = ("select value_id, value from hpo_hyperparam_values " + + "where param_id=%s;" % param_id) DB.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break value_id, value = row[0:2] - results.append([value_id,value]) + results.append([value_id, value]) return results + import argparse + parser = argparse.ArgumentParser(description="Query the DB.") parser.add_argument("--hpo", action="store", help="specify HPO ID") -parser.add_argument("--list-hpos", action="store_true", - help="list HPO IDs") -parser.add_argument("--list-params", action="store_true", +parser.add_argument("--list-hpos", action="store_true", help="list HPO IDs") +parser.add_argument("--list-params", + action="store_true", help="list hyperparameters") -parser.add_argument("-v", "--verbose", action="store_true", +parser.add_argument("-v", + "--verbose", + action="store_true", help="echo SQL statements") args = parser.parse_args() argv = vars(args) @@ -59,7 +68,8 @@ 
def list_values(param_id): if argv["verbose"]: print(str(args)) -DB = xcorr_db('xcorr.db', log=argv["verbose"]) +DB = xcorr_db("xcorr.db", log=argv["verbose"]) + def argv_hpo(): global argv @@ -68,6 +78,7 @@ def argv_hpo(): exit(1) return argv["hpo"] + if argv["list_hpos"]: entries = list_hpos() for entry in entries: diff --git a/workflows/cp1/db/db-hpo-setup.py b/workflows/cp1/db/db-hpo-setup.py index 36367654..a7de9ead 100644 --- a/workflows/cp1/db/db-hpo-setup.py +++ b/workflows/cp1/db/db-hpo-setup.py @@ -1,57 +1,64 @@ - # DB HPO SETUP -import os, sys +import os +import sys + import yaml +from xcorr_db import q, xcorr_db -from xcorr_db import xcorr_db, q +DB = xcorr_db("xcorr.db", log=True) -DB = xcorr_db('xcorr.db', log=True) def ensure_hpo_exists(hpo_id): - cmd = "select hpo_id from hpo_ids where hpo_id="+str(hpo_id)+";" + cmd = "select hpo_id from hpo_ids where hpo_id=" + str(hpo_id) + ";" DB.cursor.execute(cmd) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break print("Found in DB: hpo_id=" + str(hpo_id)) return import datetime - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") DB.insert(table="hpo_ids", names=["hpo_id", "time"], - values=[q(hpo_id),q(ts)]) + values=[q(hpo_id), q(ts)]) print("SQL: created: hpo_id=" + str(hpo_id)) + def insert_hyperparam_defns(hpo_id, yamlfile): - """ - Copy hyperparameter definitions from YAML to SQL - """ + """Copy hyperparameter definitions from YAML to SQL.""" with open(yamlfile) as fp: s = fp.read() y = yaml.load(s) for hp in y: - print("hyperparameter '%s' has %2i values" % \ - (hp, len(y[hp]["values"]))) - param_id = DB.insert(table="hpo_hyperparam_defns", - names=["hpo_id", "name"], - values=[q(hpo_id), q(hp)]) + print("hyperparameter '%s' has %2i values" % (hp, len(y[hp]["values"]))) + param_id = DB.insert( + table="hpo_hyperparam_defns", + names=["hpo_id", "name"], + values=[q(hpo_id), q(hp)], + ) # 
print("param_id " + str(param_id)) values = y[hp]["values"] for p in values: print(" " + p) - DB.insert(table="hpo_hyperparam_values", - names=["param_id","value"], - values=[q(param_id),q(p)]) + DB.insert( + table="hpo_hyperparam_values", + names=["param_id", "value"], + values=[q(param_id), q(p)], + ) + def usage(): print("usage: db-hpo-setup ") + if len(sys.argv) != 3: usage() exit(1) -hpo_id = int(sys.argv[1]) +hpo_id = int(sys.argv[1]) yamlfile = sys.argv[2] # Catch and print all exceptions to improve visibility of success/failure @@ -62,6 +69,7 @@ def usage(): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: diff --git a/workflows/cp1/db/hpo-defns-1.yaml b/workflows/cp1/db/hpo-defns-1.yaml index 99591daa..27b53ad3 100644 --- a/workflows/cp1/db/hpo-defns-1.yaml +++ b/workflows/cp1/db/hpo-defns-1.yaml @@ -1,15 +1,19 @@ - # HPO DEFNS 1 # Example for testing with values from Google Sheet "cp1" activation: description: activation - values: [ relu, tanh, sigmoid ] + values: [relu, tanh, sigmoid] dense_feature_layers: description: dense_feature_layers - values: [ "[1000, 1000, 1000]", - "[1000, 1000, 1000,1000]", "[1000, 1000, 1000, 1000, 1000]", - "[1000, 1000, 1000, 1000, 1000, 1000]", - "[2000, 2000, 2000]", - "[2000, 2000, 2000, 2000]", "[2000, 2000, 2000, 2000, 2000]", - "[2000, 2000, 2000, 2000, 2000, 2000]" ] + values: + [ + "[1000, 1000, 1000]", + "[1000, 1000, 1000,1000]", + "[1000, 1000, 1000, 1000, 1000]", + "[1000, 1000, 1000, 1000, 1000, 1000]", + "[2000, 2000, 2000]", + "[2000, 2000, 2000, 2000]", + "[2000, 2000, 2000, 2000, 2000]", + "[2000, 2000, 2000, 2000, 2000, 2000]", + ] diff --git a/workflows/cp1/nested_me_ex/.gitignore b/workflows/cp1/nested_me_ex/.gitignore index f6b35bb3..c0ee542e 100644 --- a/workflows/cp1/nested_me_ex/.gitignore +++ b/workflows/cp1/nested_me_ex/.gitignore @@ -1,4 +1,4 @@ *.tic __pycache__/ *.pyc -experiments/ \ No newline at end of file +experiments/ diff --git 
a/workflows/cp1/nested_me_ex/README.md b/workflows/cp1/nested_me_ex/README.md index d27aa813..2f2067ac 100644 --- a/workflows/cp1/nested_me_ex/README.md +++ b/workflows/cp1/nested_me_ex/README.md @@ -1,17 +1,17 @@ -## An Example Nested Model Exploration (ME) Workflow ## - -The workflow in `swift/workflow.swift` is a nested workflow where a *me1* resident -task provides parameters to any available *me2* resident tasks. The number of -*me2* resident tasks must be set before hand in `swift/workflow.sh` via -the *TURBINE_RESIDENT_WORK_WORKERS* variable. There must be at least 3 -*TURBINE_RESIDENT_WORK_WORKERS*: one for the *me1* resident task, one for -the *task_cache* (see below) resident task and one for an *me2* -resident task. Any more than 3 and the additional resident tasks are *me2* +## An Example Nested Model Exploration (ME) Workflow + +The workflow in `swift/workflow.swift` is a nested workflow where a _me1_ resident +task provides parameters to any available _me2_ resident tasks. The number of +_me2_ resident tasks must be set before hand in `swift/workflow.sh` via +the _TURBINE_RESIDENT_WORK_WORKERS_ variable. There must be at least 3 +_TURBINE_RESIDENT_WORK_WORKERS_: one for the _me1_ resident task, one for +the _task_cache_ (see below) resident task and one for an _me2_ +resident task. Any more than 3 and the additional resident tasks are _me2_ resident tasks. To run the workflow, edit `swift/workflow.sh` for your machine (i.e. edit -swift-t location, number of PROCS etc.), and -run. The script takes a single argument: an experiment id. So, ```./workflow.sh t1``` +swift-t location, number of PROCS etc.), and +run. The script takes a single argument: an experiment id. So, `./workflow.sh t1` Note that is work in progress and I have seen some seg faults when then entire workflow has finished. @@ -20,45 +20,44 @@ The implementation consists of two nested loops driven by these resident tasks. The overall flow looks like: 1. Initialization -2. 
The *me1* produces sets of parameters -3. Each parameter set is consumed by an *me2* instance -4. An *me2* instance produces parameters for model runs -5. After some number of model runs, the *me2* returns a result to the *me1* and we go back to step 2. - -Both loops are typical EMEWS style ME loops where some python code is intialized -with an *EQPy_init_package* and an *EQPy_run* (this latter call is new and custom -for this). For the *me1* we can see the initialization in line 133 and run in line 134. -The *me1* package is in `python/me1.py` which constains some dummy code +2. The _me1_ produces sets of parameters +3. Each parameter set is consumed by an _me2_ instance +4. An _me2_ instance produces parameters for model runs +5. After some number of model runs, the _me2_ returns a result to the _me1_ and we go back to step 2. + +Both loops are typical EMEWS style ME loops where some python code is initialized +with an _EQPy_init_package_ and an _EQPy_run_ (this latter call is new and custom +for this). For the _me1_ we can see the initialization in line 133 and run in line 134. +The _me1_ package is in `python/me1.py` which contains some dummy code to exercise the workflow. -The *me1* loop starts on line 151. The *EQPy_get* on line 157 produces the actual -parameters for the me2 to work on. THe *eqpy.OUT_put* on line 19 of -me1.py is what is sending these parameters from *me1.py*. +The _me1_ loop starts on line 151. The _EQPy_get_ on line 157 produces the actual +parameters for the me2 to work on. The _eqpy.OUT_put_ on line 19 of +me1.py is what is sending these parameters from _me1.py_. -The *me1* loops runs an me2 instance in lines 180-181. +The _me1_ loop runs an me2 instance in lines 180-181. ```objc string free_rank = EQPy_get(cache_loc); results[j] = start_me2(p, i, j, free_rank); ``` -The *EQPy_get* call gets the rank of an available resident task that can -be used to run the me2. *start_me2* then runs the -me2 loop using that resident task. 
- -The placeholder me2 ME is implemented in `python/me2.py`. -As usual with EMEWS and like the *me1.py* above, this produces parameters and -passes them to swift for evaluation. The *eqpy.OUT_put(ps)* on line 32 in - `python/me2.py` produces the parameters and those parameters - are received by swift on line 72 in `swift/workflow.swift` in the *run_me2* - loop. Note that currently the *run_model* call on line 95 that receives these parameters - is just a placeholder. In the actual case, that would call the actual code to run the model. - - There's an additional swift resident task that runs the `python/task_cache.py` package. - This keeps track of which me2 resident tasks are available for work. MPI is used to - communicate between `task_cache` and `me2`. `task_cache` contains a list of MPI ranks - that can be used to run `me2` resident tasks. These ranks are pushed into an EQPY - queue where they can be retreived by the swift workflow. When an `me2` instance completes, its rank is pushed into the queue, indicating that that rank is now free for work. `task_cache.init_comm` and `me2.init` create an MPI communicator that they - use to communicate. I couldn't get this work without the back channel MPI. The code - seemed to deadlock at various points. If there's a better way, please let me know. - +The _EQPy_get_ call gets the rank of an available resident task that can +be used to run the me2. _start_me2_ then runs the +me2 loop using that resident task. + +The placeholder me2 ME is implemented in `python/me2.py`. +As usual with EMEWS and like the _me1.py_ above, this produces parameters and +passes them to swift for evaluation. The _eqpy.OUT_put(ps)_ on line 32 in +`python/me2.py` produces the parameters and those parameters +are received by swift on line 72 in `swift/workflow.swift` in the _run_me2_ +loop. Note that currently the _run_model_ call on line 95 that receives these parameters +is just a placeholder. 
In the actual case, that would call the actual code to run the model. + +There's an additional swift resident task that runs the `python/task_cache.py` package. +This keeps track of which me2 resident tasks are available for work. MPI is used to +communicate between `task_cache` and `me2`. `task_cache` contains a list of MPI ranks +that can be used to run `me2` resident tasks. These ranks are pushed into an EQPY +queue where they can be retrieved by the swift workflow. When an `me2` instance completes, its rank is pushed into the queue, indicating that that rank is now free for work. `task_cache.init_comm` and `me2.init` create an MPI communicator that they +use to communicate. I couldn't get this to work without the back channel MPI. The code +seemed to deadlock at various points. If there's a better way, please let me know. diff --git a/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py b/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py index e98e7f94..8b2f6618 100644 --- a/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py +++ b/workflows/cp1/nested_me_ex/ext/EQ-Py/eqpy.py @@ -1,6 +1,7 @@ -import threading +import importlib import sys -import importlib, traceback +import threading +import traceback EQPY_ABORT = "EQPY_ABORT" @@ -18,6 +19,7 @@ aborted = False wait_info = None + class WaitInfo: def __init__(self): @@ -28,6 +30,7 @@ def getWait(self): self.wait += 1 return self.wait + class InitializingThreadRunner(threading.Thread): def __init__(self, runnable): @@ -41,7 +44,7 @@ def run(self): except AttributeError: pass - + class ThreadRunner(threading.Thread): def __init__(self, runnable): @@ -56,20 +59,23 @@ def run(self): # tuple of type, value and traceback self.exc = traceback.format_exc() + def init(pkg): global p1, wait_info wait_info = WaitInfo() imported_pkg = importlib.import_module(pkg) - #print(pkg);sys.stdout.flush() + # print(pkg);sys.stdout.flush() p1 = InitializingThreadRunner(imported_pkg) p1.start() + def run(): global p2 p2 = ThreadRunner(p1.runnable) - 
#print(p.runnable);sys.stdout.flush() + # print(p.runnable);sys.stdout.flush() p2.start() + def output_q_get(): global output_q, aborted wait = wait_info.getWait() @@ -93,15 +99,19 @@ def output_q_get(): return result + import sys + def input_q_put(val): # print("q put {}".format(val));sys.stdout.flush() input_q.put(val) + def OUT_put(string_params): output_q.put(string_params) + def IN_get(): # global input_q result = input_q.get() diff --git a/workflows/cp1/nested_me_ex/python/me1.py b/workflows/cp1/nested_me_ex/python/me1.py index 4edbe111..d2febbca 100644 --- a/workflows/cp1/nested_me_ex/python/me1.py +++ b/workflows/cp1/nested_me_ex/python/me1.py @@ -1,9 +1,10 @@ -import eqpy import random +import eqpy # Generates parameters to be used by other MEs + def run(): # gets dummy params for this me params = eqpy.IN_get() @@ -12,15 +13,18 @@ def run(): for _ in range(10): op = [] for _ in range(5): - p = "{},{},{},{}".format(random.randint(1, 10), - random.randint(1, 10), random.randint(1, 10), - random.randint(1, 10)) + p = "{},{},{},{}".format( + random.randint(1, 10), + random.randint(1, 10), + random.randint(1, 10), + random.randint(1, 10), + ) op.append(p) - + ps = ";".join(op) eqpy.OUT_put(ps) # wait to get result back eqpy.IN_get() - + eqpy.OUT_put("DONE") eqpy.OUT_put("final result") diff --git a/workflows/cp1/nested_me_ex/python/me2.py b/workflows/cp1/nested_me_ex/python/me2.py index ea59f57e..acdc8386 100644 --- a/workflows/cp1/nested_me_ex/python/me2.py +++ b/workflows/cp1/nested_me_ex/python/me2.py @@ -1,30 +1,36 @@ -import eqpy, sys +import sys + +import eqpy from mpi4py import MPI + def printf(s): print(s) sys.stdout.flush() + cache_comm = None + def init(): global cache_comm ranks_str = eqpy.IN_get() - ranks = ranks_str.split(',')[1:] - #print(ranks) + ranks = ranks_str.split(",")[1:] + # print(ranks) if cache_comm == None: comm = MPI.COMM_WORLD group = comm.Get_group() cache_group = group.Incl([int(x) for x in ranks]) - #printf("ME newgroup size is 
{}".format(cache_group.size)) - cache_comm = comm.Create_group(cache_group,1) + # printf("ME newgroup size is {}".format(cache_group.size)) + cache_comm = comm.Create_group(cache_group, 1) + def run(): # my swift-t MPI comm rank, and destination rank for cache_comm rank = eqpy.IN_get() - #printf("AL Start on {}".format(rank)) + # printf("AL Start on {}".format(rank)) param = eqpy.IN_get() - + for _ in range(10): op = [param] * 5 ps = ";".join(op) @@ -34,10 +40,5 @@ def run(): eqpy.OUT_put("DONE") eqpy.OUT_put("42") - data = {'msg' : 'put', 'rank' : rank} + data = {"msg": "put", "rank": rank} cache_comm.send(data, dest=0, tag=1) - - - - - diff --git a/workflows/cp1/nested_me_ex/python/task_cache.py b/workflows/cp1/nested_me_ex/python/task_cache.py index 08965c82..00fab96f 100644 --- a/workflows/cp1/nested_me_ex/python/task_cache.py +++ b/workflows/cp1/nested_me_ex/python/task_cache.py @@ -1,38 +1,42 @@ -import eqpy import sys + +import eqpy from mpi4py import MPI + def printf(s): print(s) sys.stdout.flush() + def init_comm(ranks): - comm = MPI.COMM_WORLD + comm = MPI.COMM_WORLD group = comm.Get_group() cache_group = group.Incl([int(x) for x in ranks]) - #printf("Cache Group size is {}".format(cache_group.size)) - return comm.Create_group(cache_group,1) + # printf("Cache Group size is {}".format(cache_group.size)) + return comm.Create_group(cache_group, 1) + def run(): ranks_str = eqpy.IN_get() - ranks = ranks_str.split(',') + ranks = ranks_str.split(",") # include only the al ranks task_ranks = ranks[2:] - + for r in task_ranks: eqpy.OUT_put(r) # include self and tasks in comm comm = init_comm(ranks[1:]) rank = comm.rank - #printf("task cache rank: {}".format(rank)) + # printf("task cache rank: {}".format(rank)) while True: - status = MPI.Status() + status = MPI.Status() data = comm.recv(source=MPI.ANY_SOURCE, status=status) - msg = data['msg'] - if msg == 'put': + msg = data["msg"] + if msg == "put": # this is its rank in the swift mpi communicator - 
eqpy.OUT_put(data['rank']) - elif msg == 'DONE': + eqpy.OUT_put(data["rank"]) + elif msg == "DONE": break diff --git a/workflows/cp1/nested_me_ex/swift/workflow.sh b/workflows/cp1/nested_me_ex/swift/workflow.sh index f92b57f1..30809b8a 100755 --- a/workflows/cp1/nested_me_ex/swift/workflow.sh +++ b/workflows/cp1/nested_me_ex/swift/workflow.sh @@ -86,4 +86,4 @@ set -x swift-t -n $PROCS $MACHINE -p -r$EQPy -I $EQPy \ -e MPICH_MAX_THREAD_SAFETY=$MPICH_MAX_THREAD_SAFETY \ -e PYTHONPATH=$PYTHONPATH \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift \ No newline at end of file + $EMEWS_PROJECT_ROOT/swift/workflow.swift diff --git a/workflows/cp1/nested_me_ex/swift/workflow.swift b/workflows/cp1/nested_me_ex/swift/workflow.swift index 3fb47d81..31dde226 100644 --- a/workflows/cp1/nested_me_ex/swift/workflow.swift +++ b/workflows/cp1/nested_me_ex/swift/workflow.swift @@ -54,7 +54,7 @@ int CACHE_RANK_IDX = 1; EQPy_run(me2_location) => EQPy_put(me2_location, me2_rank) => EQPy_put(me2_location, params) => - run_me2(me2_location, iter, param_id, me2_rank) => + run_me2(me2_location, iter, param_id, me2_rank) => // get fake results from ME2 run result = get_result(); } @@ -91,7 +91,7 @@ int CACHE_RANK_IDX = 1; foreach p, j in param_array { // TODO update run_model with code to actually - // run the model with the parameters + // run the model with the parameters // produced from the active learning. 
results[j] = run_model(p, i, j); } @@ -105,7 +105,7 @@ int CACHE_RANK_IDX = 1; (void o) init_tasks_cache() { rank = r_ranks[CACHE_RANK_IDX]; location loc = locationFromRank(string2int(rank)); - EQPy_init_package(loc, "task_cache") => + EQPy_init_package(loc, "task_cache") => EQPy_run(loc) => EQPy_put(loc, join(r_ranks, ",")) => o = propagate(); @@ -122,7 +122,7 @@ int CACHE_RANK_IDX = 1; foreach i in [2 : size(r_ranks) - 1] { init_me2_rank(r_ranks[i]); waiter[i] = r_ranks[i]; - } + } } (void o) start() { @@ -171,7 +171,7 @@ int CACHE_RANK_IDX = 1; string param_array[] = split(params, ";"); string results[]; // printf("%i", size(param_array)); - // Lauch an me2 run for each set of parameters produced by + // Lauch an me2 run for each set of parameters produced by // me1 foreach p, j in param_array { diff --git a/workflows/cp1/scripts/parse_infer_results.py b/workflows/cp1/scripts/parse_infer_results.py index c98562fd..c45ac7ef 100644 --- a/workflows/cp1/scripts/parse_infer_results.py +++ b/workflows/cp1/scripts/parse_infer_results.py @@ -1,82 +1,113 @@ - -import sys import csv +import datetime +import json import subprocess -import datetime, json +import sys from os import path + import numpy as np - #mse: 0.2190,min,max,std - #mae: 0.3251 - #r2: 0.4320 - #corr: 0.6584 +# mse: 0.2190,min,max,std +# mae: 0.3251 +# r2: 0.4320 +# corr: 0.6584 def grep(infer_log): - output = subprocess.check_output(['grep', '-E', "mse:|mae:|r2:|corr:", infer_log]) - lines = output.decode("utf-8").strip().split('\n') + output = subprocess.check_output( + ["grep", "-E", "mse:|mae:|r2:|corr:", infer_log]) + lines = output.decode("utf-8").strip().split("\n") # print(lines) result = [np.nan] * 16 - # id, start, end, train time, epochs + # id, start, end, train time, epochs for line in lines: line = line.strip() - if line.startswith('mse:') and line.find(',') != -1: + if line.startswith("mse:") and line.find(",") != -1: l = line[5:] - result[0], result[1], result[2], result[3] = [float(x) for 
x in l.split(',')] - elif line.startswith('mae:') and line.find(',') != -1: + result[0], result[1], result[2], result[3] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("mae:") and line.find(",") != -1: l = line[5:] - result[4], result[5], result[6], result[7] = [float(x) for x in l.split(',')] - elif line.startswith('r2') and line.find(',') != -1: + result[4], result[5], result[6], result[7] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("r2") and line.find(",") != -1: l = line[3:] - result[8], result[9], result[10], result[11] = [float(x) for x in l.split(',')] - elif line.startswith('corr') and line.find(',') != -1: + result[8], result[9], result[10], result[11] = [ + float(x) for x in l.split(",") + ] + elif line.startswith("corr") and line.find(",") != -1: l = line[6:] - result[12], result[13], result[14], result[15] = [float(x) for x in l.split(',')] + result[12], result[13], result[14], result[15] = [ + float(x) for x in l.split(",") + ] # print(result) return result + def create_params_map(training_file): param_map = {} with open(training_file) as f_in: reader = csv.reader(f_in, delimiter="|") for r in reader: params = json.loads(r[2]) - save_path = params['save_path'] - if save_path[-1] == '/': + save_path = params["save_path"] + if save_path[-1] == "/": save_path = save_path[:-1] param_map[save_path] = params - + return param_map def main(infer_log, training_file, out_file): param_map = create_params_map(training_file) - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: writer = csv.writer(f_out) - writer.writerow(['infer_id', 'model_class', 'instance_directory', 'params', 'model_path', - 'mse_mean', 'mse_std', 'mse_min', 'mse_max', - 'mae_mean', 'mae_std', 'mae_min', 'mae_max', - 'r2_mean', 'r2_std', 'r2_min', 'r2_max', - 'corr_mean', 'corr_std', 'corr_min', 'corr_max']) + writer.writerow([ + "infer_id", + "model_class", + "instance_directory", + "params", + "model_path", + "mse_mean", + "mse_std", + 
"mse_min", + "mse_max", + "mae_mean", + "mae_std", + "mae_min", + "mae_max", + "r2_mean", + "r2_std", + "r2_min", + "r2_max", + "corr_mean", + "corr_std", + "corr_min", + "corr_max", + ]) with open(infer_log) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") # model class|data file|model|instance_dir for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) model_class = row[0] instance_dir = row[3] model_dir = path.dirname(row[2]) params = param_map[model_dir] - stats = grep('{}/infer.log'.format(instance_dir)) + stats = grep("{}/infer.log".format(instance_dir)) if not np.isnan(stats[0]): - result = [i, model_class, instance_dir, params, row[2]] + stats + result = [i, model_class, instance_dir, params, row[2] + ] + stats writer.writerow(result) else: - print("{}|{}|{}|{}".format(row[0], row[1], row[2], row[3], row[4])) + print("{}|{}|{}|{}".format(row[0], row[1], row[2], row[3], + row[4])) -if __name__ == '__main__': - # inference log file (e.g. infer_all_4/log.txt), training file (e.g. full_training_2/inputs.txt), output file, +if __name__ == "__main__": + # inference log file (e.g. infer_all_4/log.txt), training file (e.g. 
full_training_2/inputs.txt), output file, main(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/workflows/cp1/scripts/parse_start_stop.py b/workflows/cp1/scripts/parse_start_stop.py index 879467d5..9fdca7e0 100644 --- a/workflows/cp1/scripts/parse_start_stop.py +++ b/workflows/cp1/scripts/parse_start_stop.py @@ -1,17 +1,17 @@ -import sys import csv -import subprocess import datetime import os - +import subprocess +import sys from operator import itemgetter -TIME_FORMAT='%Y/%m/%d %H:%M:%S' +TIME_FORMAT = "%Y/%m/%d %H:%M:%S" START = 0 STOP = 1 + def create_counts(timings_file, out_dir): - hpos = {'all' : []} + hpos = {"all": []} with open(timings_file) as f_in: reader = csv.reader(f_in) for row in reader: @@ -21,12 +21,12 @@ def create_counts(timings_file, out_dir): hpos[hpo_id].append(line) else: hpos[hpo_id] = [line] - hpos['all'].append(line) + hpos["all"].append(line) for k in hpos: hpos[k] = sorted(hpos[k], key=itemgetter(0)) - counts = {'all' : []} + counts = {"all": []} for k in hpos: count = 0 for ts, ev in hpos[k]: @@ -34,53 +34,53 @@ def create_counts(timings_file, out_dir): count += 1 else: count -= 1 - + if k in counts: counts[k].append([ts, count]) else: counts[k] = [[ts, count]] - + for k in counts: - with open('{}/{}_counts.csv'.format(out_dir, k), 'w') as f_out: + with open("{}/{}_counts.csv".format(out_dir, k), "w") as f_out: writer = csv.writer(f_out) for item in counts[k]: writer.writerow(item) - def grep(model_log): - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") result = [] - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result.append((dt, START)) else: 
result.append((dt, STOP)) - + return result def main(hpos_file, out_file): - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: with open(hpos_file) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) # hpo_id = row[1] run_dir = row[3] rd = os.path.basename(run_dir) - hpo_id = rd[:rd.find('_')] - result = grep('{}/model.log'.format(run_dir)) + hpo_id = rd[:rd.find("_")] + result = grep("{}/model.log".format(run_dir)) for r in result: - f_out.write('{},{},{}\n'.format(hpo_id, r[0], r[1])) + f_out.write("{},{},{}\n".format(hpo_id, r[0], r[1])) if __name__ == "__main__": - #main(sys.argv[1], sys.argv[2]) + # main(sys.argv[1], sys.argv[2]) create_counts(sys.argv[1], sys.argv[2]) diff --git a/workflows/cp1/scripts/parse_start_stop_upf.py b/workflows/cp1/scripts/parse_start_stop_upf.py index 0b2c7d03..4268905e 100644 --- a/workflows/cp1/scripts/parse_start_stop_upf.py +++ b/workflows/cp1/scripts/parse_start_stop_upf.py @@ -1,15 +1,17 @@ -import sys import csv +import datetime +import json import subprocess -import datetime, json +import sys from os import path -TIME_FORMAT='%Y/%m/%d %H:%M:%S' +TIME_FORMAT = "%Y/%m/%d %H:%M:%S" START = 0 STOP = 1 + def create_counts(timings_file): - hpos = {'all' : []} + hpos = {"all": []} with open(timings_file) as f_in: reader = csv.reader(f_in) for row in reader: @@ -18,12 +20,12 @@ def create_counts(timings_file): hpos[hpo_id].append(row[1:]) else: hpos[hpo_id] = [row[1:]] - hpos['all'].append(row[1:]) + hpos["all"].append(row[1:]) for k in hpos: sorted(hpos[k], itemgetter(0)) - counts = {'all' : []} + counts = {"all": []} for k in hpos: count = 0 for ts, ev in hpos[k]: @@ -31,7 +33,7 @@ def create_counts(timings_file): count += 1 else: count -= 1 - + if k in counts: counts[k].append([ts, count]) else: @@ -39,67 +41,73 @@ def create_counts(timings_file): def 
grep(model_log, rid, model_name): - output = subprocess.check_output(['grep', '-E', "RUN START|RUN STOP", model_log]) + output = subprocess.check_output( + ["grep", "-E", "RUN START|RUN STOP", model_log]) lines = output.decode("utf-8") # id, start, end, train time, epochs result = [int(rid), model_name, -1, -1, -1, -1] complete = False - for line in lines.split('\n'): - idx = line.find(' __main') + for line in lines.split("\n"): + idx = line.find(" __main") if idx != -1: ts = line[0:idx] dt = datetime.datetime.strptime(ts, TIME_FORMAT).timestamp() - if line.endswith('START'): + if line.endswith("START"): result[2] = dt - elif line.endswith('STOP'): + elif line.endswith("STOP"): result[3] = dt complete = True # Current time ....1888.599 # Epoch 2/100 - output = subprocess.check_output(['grep', '-E', "Current time", model_log]) - lines = output.decode("utf-8").strip().split('\n') + output = subprocess.check_output(["grep", "-E", "Current time", model_log]) + lines = output.decode("utf-8").strip().split("\n") line = lines[-1] - ct = line[line.rfind(' ....') + len(' ....') : ].strip() + ct = line[line.rfind(" ....") + len(" ...."):].strip() result[4] = float(ct) - - output = subprocess.check_output(['grep', '-E', "Epoch", model_log]) - lines = output.decode("utf-8").strip().split('\n') + + output = subprocess.check_output(["grep", "-E", "Epoch", model_log]) + lines = output.decode("utf-8").strip().split("\n") if complete: line = lines[-1] else: line = lines[-2] - epochs = line[line.find(' ') : line.find('/') ] + epochs = line[line.find(" "):line.find("/")] result[5] = int(epochs) - return result + def write_results(results): - with open('timings.txt', 'w') as f_out: - + with open("timings.txt", "w") as f_out: + result = results[hpo_id] for r in result: for i in r: - f_out.write('{} {}\n'.format(i[0], i[1])) + f_out.write("{} {}\n".format(i[0], i[1])) + def main(hpos_file, out_file): results = {} - with open(out_file, 'w') as f_out: + with open(out_file, "w") as f_out: 
writer = csv.writer(f_out) - writer.writerow(['upf_id', 'model_name', 'start_ts', 'end_ts', 'total_train_time', 'epochs']) + writer.writerow([ + "upf_id", "model_name", "start_ts", "end_ts", "total_train_time", + "epochs" + ]) with open(hpos_file) as f_in: - reader = csv.reader(f_in, delimiter='|') + reader = csv.reader(f_in, delimiter="|") for i, row in enumerate(reader): if i % 1000 == 0: - print('ROW: {}'.format(i)) + print("ROW: {}".format(i)) upf_id = row[0] params = json.loads(row[2]) - bname = path.basename(params['use_exported_data']) - model_name = bname[ : bname.find('.')] - run_dir = params['save_path'] - result = grep('{}/model.log'.format(run_dir), upf_id, model_name) + bname = path.basename(params["use_exported_data"]) + model_name = bname[:bname.find(".")] + run_dir = params["save_path"] + result = grep("{}/model.log".format(run_dir), upf_id, + model_name) writer.writerow(result) diff --git a/workflows/cp1/scripts/plots.R b/workflows/cp1/scripts/plots.R index 89c12f18..f9fefafe 100644 --- a/workflows/cp1/scripts/plots.R +++ b/workflows/cp1/scripts/plots.R @@ -52,10 +52,10 @@ ggplot (se, aes(x=start, y=hpo_id)) + geom_segment( xend=se$end, yend=se$hpo_id, size = 3 - ) + + ) + xlab('time (minutes)') + ylab('hpo id') + - scale_x_continuous(limits = c(0, max(se$end))) + scale_x_continuous(limits = c(0, max(se$end))) ft <- fread("~/Documents/results/cp1/train_upf_timings.csv") @@ -63,7 +63,7 @@ ft$time_per_epoch <- ft$total_train_time / ft$epochs fwrite(ft, file="~/Documents/results/cp1/train_upf_timings.csv", row.names = F) agg_ft <- ft[, .(min(total_train_time), max(total_train_time), mean(total_train_time), sd(total_train_time), - min(epochs), max(epochs), mean(epochs), sd(epochs), + min(epochs), max(epochs), mean(epochs), sd(epochs), min(time_per_epoch), max(time_per_epoch), mean(time_per_epoch), sd(time_per_epoch)), by=model_name] setnames(agg_ft, c("model_name", "min_train_time", "max_train_time", "mean_train_time", "std_train_time", "min_epochs", 
"max_epochs", "mean_epochs", "std_epochs", "min_time_per_epoch", "max_time_per_epoch", "mean_time_per_epoch", "std_time_per_epoch")) @@ -72,7 +72,7 @@ fwrite(agg_ft, file="~/Documents/results/cp1/agg_timings_by_model.csv", row.name idt <- fread("~/Documents/results/cp1/inference_results.csv") agg_idt <- idt[, .(min(r2), max(r2), mean(r2), sd(r2), - min(mae), max(mae), mean(mae), sd(mae), + min(mae), max(mae), mean(mae), sd(mae), min(mse), max(mse), mean(mse), sd(mse)), by=model_class] setnames(agg_idt, c("model_class", "min_r2", "max_r2", "mean_r2", "std_r2", "min_mae", "max_mae", "mean_mae", "std_mae", "min_mse", "max_mse", "mean_mse", "std_mse")) @@ -92,7 +92,7 @@ dts = list() i = 1 for (f in fs) { - hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), + hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), sep="|") fname = basename(f) vals <- strsplit(fname, "_", fixed=T) @@ -105,7 +105,7 @@ for (f in fs) { results_dir <- '~/Documents/results/cp1/nci_hpo_log/' fs <- Sys.glob(paste0(results_dir, '/*_hpo_runs.txt')) for (f in fs) { - hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), + hpo_dt <- fread(f, col.names = c("run_id", 'xcorr_record_id', 'params', 'instance_dir', 'timestamp', 'val_loss'), sep="|") fname = basename(f) vals <- strsplit(fname, "_", fixed=T) @@ -130,5 +130,3 @@ ggplot(data=hpos[val_loss < 1e+03], mapping=aes(x=iteration, y=val_loss)) + ylab("Val Loss (log scale)") + scale_y_continuous(trans='log10') + facet_wrap(~ hpo_id, ncol=5) - - diff --git a/workflows/cp1/sh/infer.sh b/workflows/cp1/sh/infer.sh index 750c1fee..2ea1c7de 100755 --- a/workflows/cp1/sh/infer.sh +++ b/workflows/cp1/sh/infer.sh @@ -16,7 +16,7 @@ then exit 1 fi -INSTANCE_DIRECTORY=$1 +INSTANCE_DIRECTORY=$1 DF="$2" MODEL_FILE="$3" N_PRED=$4 diff --git a/workflows/cp1/swift/infer_workflow.sh 
b/workflows/cp1/swift/infer_workflow.sh index be860652..3f5a05bc 100755 --- a/workflows/cp1/swift/infer_workflow.sh +++ b/workflows/cp1/swift/infer_workflow.sh @@ -102,7 +102,7 @@ mkdir -pv $TURBINE_OUTPUT/data # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/infer_workflow.swift b/workflows/cp1/swift/infer_workflow.swift index d821adfb..65037628 100644 --- a/workflows/cp1/swift/infer_workflow.swift +++ b/workflows/cp1/swift/infer_workflow.swift @@ -40,7 +40,7 @@ string n_pred = argv("n_pred"); */ app (void o) run_model (string model_sh, string instance_dir, string data_file, string model_file, string run_id) { - // 1 2 3 4 5 + // 1 2 3 4 5 "bash" model_sh instance_dir data_file model_file n_pred run_id; } @@ -75,4 +75,3 @@ main() { write_lines(inputs, "log.txt"); } - diff --git a/workflows/cp1/swift/nci_workflow.swift b/workflows/cp1/swift/nci_workflow.swift index 47a85eea..6739e095 100644 --- a/workflows/cp1/swift/nci_workflow.swift +++ b/workflows/cp1/swift/nci_workflow.swift @@ -80,7 +80,7 @@ cache_dir = '%s' if len(cell_feature_subset_path) > 0: params['cell_feature_subset_path'] = cell_feature_subset_path # GDSC_NCI60_1600_800_features.txt - # GDSC_NCI60_2000_1000.h5 + # GDSC_NCI60_2000_1000.h5 import os ex_data_f = os.path.basename(params['cell_feature_subset_path']) idx = ex_data_f.rfind('_features') @@ -130,7 +130,7 @@ record_id = DB.insert_xcorr_record(studies=studies, """; -(string hpo_id) insert_hpo(string xcorr_record_id) +(string hpo_id) insert_hpo(string xcorr_record_id) { hpo_template = """ @@ -145,7 +145,7 @@ hpo_id = DB.insert_hpo_record(%s) hpo_id = python_persist(code, "str(hpo_id)"); } -(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) 
+(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) { run_template = """ @@ -160,7 +160,7 @@ run_id = DB.insert_hpo_run(%s, '%s', '%s') run_id = python_persist(code, "str(run_id)"); } -(void o) update_hpo_run(string run_id, string result) +(void o) update_hpo_run(string run_id, string result) { update_template = """ @@ -267,7 +267,7 @@ uno_xcorr.coxen_feature_selection(study1, study2, } else { results[j] = result; } - + // update_hpo_run(run_db_id, results[j]); // TODO DB: insert result with record_id } diff --git a/workflows/cp1/swift/upf_workflow.sh b/workflows/cp1/swift/upf_workflow.sh index beed8286..929dfada 100755 --- a/workflows/cp1/swift/upf_workflow.sh +++ b/workflows/cp1/swift/upf_workflow.sh @@ -70,7 +70,7 @@ export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT export TURBINE_JOBNAME="JOB:${EXPID}" -if [ -z ${GPU_STRING+x} ]; +if [ -z ${GPU_STRING+x} ]; then GPU_ARG="" else @@ -108,7 +108,7 @@ mkdir -pv $XCORR_DATA_DIR # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/upf_workflow.swift b/workflows/cp1/swift/upf_workflow.swift index 4e778176..b3ddd1a5 100644 --- a/workflows/cp1/swift/upf_workflow.swift +++ b/workflows/cp1/swift/upf_workflow.swift @@ -110,7 +110,7 @@ main() { //make_dir(instance) => { string param_code = update_param_template % (params, instance); //printf(param_code); - + updated_param = python_persist(param_code, "params_json"); inputs[i] = "%i|%f|%s" % (i, clock(), updated_param); string result = obj(updated_param, int2string(i)) => @@ -121,4 +121,3 @@ main() { write_lines(inputs, "inputs.txt"); write_lines(results, "results.txt"); } - diff --git a/workflows/cp1/swift/workflow.sh 
b/workflows/cp1/swift/workflow.sh index e487c621..ce5052ec 100755 --- a/workflows/cp1/swift/workflow.sh +++ b/workflows/cp1/swift/workflow.sh @@ -152,7 +152,7 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/cp1/swift/workflow.swift b/workflows/cp1/swift/workflow.swift index 48388aa6..62693846 100644 --- a/workflows/cp1/swift/workflow.swift +++ b/workflows/cp1/swift/workflow.swift @@ -80,7 +80,7 @@ cell_feature_subset_path = '%s' if len(cell_feature_subset_path) > 0: params['cell_feature_subset_path'] = cell_feature_subset_path # GDSC_NCI60_1600_800_features.txt - # GDSC_NCI60_2000_1000.h5 + # GDSC_NCI60_2000_1000.h5 import os ex_data_f = os.path.basename(params['cell_feature_subset_path']) idx = ex_data_f.rfind('_features') @@ -132,7 +132,7 @@ record_id = DB.insert_xcorr_record(studies=studies, """; -(string hpo_id) insert_hpo(string xcorr_record_id) +(string hpo_id) insert_hpo(string xcorr_record_id) { hpo_template = """ @@ -147,7 +147,7 @@ hpo_id = DB.insert_hpo_record(%s) hpo_id = python_persist(code, "str(hpo_id)"); } -(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) +(string run_id) insert_hpo_run(string hpo_id, string param_string, string run_directory) { run_template = """ @@ -162,7 +162,7 @@ run_id = DB.insert_hpo_run(%s, '%s', '%s') run_id = python_persist(code, "str(run_id)"); } -(void o) update_hpo_run(string run_id, string result) +(void o) update_hpo_run(string run_id, string result) { update_template = """ diff --git a/workflows/cp1/test/cfg-prm-1.sh b/workflows/cp1/test/cfg-prm-1.sh index 659d8f82..15a87a05 100644 --- a/workflows/cp1/test/cfg-prm-1.sh +++ b/workflows/cp1/test/cfg-prm-1.sh @@ -41,7 +41,7 @@ export 
DRUG_REPSONSE_DATA=$BENCHMARKS_ROOT/Data/Pilot1/rescaled_combined_single_ # Location of mlrMBO input file. uno_quick_test is # appropriate for testing PARAM_SET_FILE=$EMEWS_PROJECT_ROOT/data/uno_quick_test.R -# Actual useful mlrMBO input file for uno: uno_hpo.R +# Actual useful mlrMBO input file for uno: uno_hpo.R # PARAM_SET_FILE=$EMEWS_PROJECT_ROOT/data/uno_hpo.R if [[ "${PARAM_SET_FILE:-}" == "" ]]; then diff --git a/workflows/cp1/test/cfg-sys-1.sh b/workflows/cp1/test/cfg-sys-1.sh index 11fb3fa5..d7e2f118 100644 --- a/workflows/cp1/test/cfg-sys-1.sh +++ b/workflows/cp1/test/cfg-sys-1.sh @@ -2,7 +2,7 @@ # MLRMBO CFG SYS 1 # The total number of MPI processes including 2 -# for swift internals, and the number of +# for swift internals, and the number of # mlrMBO instances and the number of individual # Uno HPO runs. export PROCS=${PROCS:-6} diff --git a/workflows/cp1/test/cfg-sys-3.sh b/workflows/cp1/test/cfg-sys-3.sh index 6c30a103..d6f4718b 100644 --- a/workflows/cp1/test/cfg-sys-3.sh +++ b/workflows/cp1/test/cfg-sys-3.sh @@ -17,7 +17,7 @@ export PPN=${PPN:-1} # For Theta: # export QUEUE=${QUEUE:-debug-flat-quad} # export QUEUE=R.candle -export QUEUE=default +export QUEUE=default export WALLTIME=${WALLTIME:-01:59} diff --git a/workflows/cp1/test/create-new-test.sh b/workflows/cp1/test/create-new-test.sh index 42c82669..83a6858b 100755 --- a/workflows/cp1/test/create-new-test.sh +++ b/workflows/cp1/test/create-new-test.sh @@ -18,6 +18,3 @@ sed -i -e "s/PROPOSE_POINTS:-5/PROPOSE_POINTS:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_CONCURRENT_EVALUATIONS:-1/MAX_CONCURRENT_EVALUATIONS:-$1/g" cfg-prm-$1.sh sed -i -e "s/DESIGN_SIZE:-10/DESIGN_SIZE:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_BUDGET:-180/MAX_BUDGET:-$Budget/g" cfg-prm-$1.sh - - - diff --git a/workflows/cp1/test_infer/cfg-prm-1.sh b/workflows/cp1/test_infer/cfg-prm-1.sh index 82de21fd..b232b1df 100644 --- a/workflows/cp1/test_infer/cfg-prm-1.sh +++ b/workflows/cp1/test_infer/cfg-prm-1.sh @@ -10,5 +10,3 @@ 
XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data UPF_FILE=$EMEWS_PROJECT_ROOT/data/infer_upf.txt # Number of predictions to make for each inference runs N_PRED=30 - - diff --git a/workflows/cp1/test_infer/cfg-prm-250.sh b/workflows/cp1/test_infer/cfg-prm-250.sh index 2404db84..0ff1d57a 100644 --- a/workflows/cp1/test_infer/cfg-prm-250.sh +++ b/workflows/cp1/test_infer/cfg-prm-250.sh @@ -5,4 +5,3 @@ CACHE_DIR=$EMEWS_PROJECT_ROOT/cache XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data UPF_FILE=$EMEWS_PROJECT_ROOT/data/infer_upf_a.txt N_PRED=30 - diff --git a/workflows/cp1/test_upf/cfg-prm-1.sh b/workflows/cp1/test_upf/cfg-prm-1.sh index 3b498310..e11a9178 100644 --- a/workflows/cp1/test_upf/cfg-prm-1.sh +++ b/workflows/cp1/test_upf/cfg-prm-1.sh @@ -8,4 +8,3 @@ XCORR_DATA_DIR=$EMEWS_PROJECT_ROOT/xcorr_data # Location of the input file that contains the parameters for each run # 1 per row UPF_FILE=$EMEWS_PROJECT_ROOT/data/upf.txt - diff --git a/workflows/csg/swift/workflow.sh b/workflows/csg/swift/workflow.sh new file mode 100755 index 00000000..bffb2625 --- /dev/null +++ b/workflows/csg/swift/workflow.sh @@ -0,0 +1,118 @@ +#! /usr/bin/env bash +set -eu + +# CMP-CV WORKFLOW SH + +# Autodetect this workflow directory +export CANDLE_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $CANDLE_PROJECT_ROOT/.. ) + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "CMP-CV: usage: workflow.sh SITE EXPID CFG_SYS PLAN" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! 
{ + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # PLAN is the hyperparameter list file + get_site $1 && \ + get_expid $2 && \ + get_cfg_sys $3 && \ + UPF=$4 + MODELS=$5 + } +then + usage + exit 1 +fi + +source_site env $SITE +source_site sched $SITE + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/DrugCell" +export PYTHONPATH="${PYTHONPATH}:/homes/ac.gpanapitiya/ccmg-mtg/models/to_Candle/SWnet" +export PYTHONPATH="${PYTHONPATH}:$WORKFLOWS_ROOT/cmp-cv/py" +log_path PYTHONPATH + +export TURBINE_JOBNAME="CMP_${EXPID}" + +export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} +export BENCHMARK_TIMEOUT +PLAN="PLAN_NOT_DEFINED" +CMD_LINE_ARGS=( -expid=$EXPID + -benchmark_timeout=$BENCHMARK_TIMEOUT + -plan=$PLAN + -models=$MODELS + -gparams=$UPF + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Copy settings to TURBINE_OUTPUT for provenance +cp $CFG_SYS $TURBINE_OUTPUT + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run + +cp -v $UPF $TURBINE_OUTPUT + +# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +TURBINE_STDOUT= + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "cmp-cv workflow.sh: Set CANDLE_DATA_DIR!" 
+fi + +export CANDLE_IMAGE=${CANDLE_IMAGE:-} + +export CANDLE_MODEL_IMPL=container + +which swift-t + +swift-t -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ + ${MACHINE:-} \ + -p \ + -I $WORKFLOWS_ROOT/common/swift \ + -i model_$CANDLE_MODEL_IMPL \ + -e BENCHMARKS_ROOT \ + -e CANDLE_PROJECT_ROOT \ + -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ + $( python_envs ) \ + -e TURBINE_STDOUT=$TURBINE_STDOUT \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} + +# Can provide this to debug Python settings: +# -e PYTHONVERBOSE=1 +# Can provide this if needed for debugging crashes: +# -e PYTHONUNBUFFERED=1 +# Can provide this if needed to reset PATH: +# -e PATH=$PATH diff --git a/workflows/csg/swift/workflow.swift b/workflows/csg/swift/workflow.swift new file mode 100644 index 00000000..555cc798 --- /dev/null +++ b/workflows/csg/swift/workflow.swift @@ -0,0 +1,59 @@ + +/** + CMP-CV WORKFLOW.SWIFT +*/ + +import assert; +import io; +import json; +import files; +import string; +import sys; + +import candle_utils; +// report_env(); + +string FRAMEWORK = "pytorch"; + +// Scan command line +// file plan = input(argv("plan")); +file model_file = input(argv("models")); +file gparams_file = input(argv("gparams")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); + +string expid = getenv("EXPID"); +string turbine_output = getenv("TURBINE_OUTPUT"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); + +// Report some key facts: +printf("Cross-Validation: %s", filename(model_file)); +system1("date \"WORKFLOW START: +%Y-%m-%d %H:%M\""); + +// Read unrolled parameter file +// string plan_lines[] = file_lines(plan); +string model_lines[] = file_lines(model_file); + +string gparams_lines[] = 
file_lines(gparams_file); + +// Resultant output values: +string results[]; + +foreach gparam, j in gparams_lines +{ + // runid = i*1000000 + j; + runid = j; + + printf("runid: %s", runid); + printf("gparams: %s", gparam); + + model_name = json_get(gparam, "model_name"); + candle_image = json_get(gparam, "candle_image"); + model_script = "train.sh"; + + printf("MODEL: %s", model_name); + // TODO: Add preprocessing script + // results[runid] = + model_script_preprocess = "preprocess.sh"; + obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script_preprocess)=>obj_container(gparam, expid, repr(runid), model_name, candle_image, model_script); + // TODO: Add inference script or loop to do multiple inferences on a trained model +} diff --git a/workflows/csg/test/cfg-sys-1.sh b/workflows/csg/test/cfg-sys-1.sh new file mode 100644 index 00000000..5158b14a --- /dev/null +++ b/workflows/csg/test/cfg-sys-1.sh @@ -0,0 +1,27 @@ + +# CMP-CV CFG SYS 1 + +# Use 1 for interactive workflows +# export INTERACTIVE=1 + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-2} + +# MPI processes per node. This should not exceed PROCS. +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-2} + +#export QUEUE=${QUEUE:-batch} + +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME + +# export MAIL_ENABLED=1 +# export MAIL_ADDRESS=woz@anl.gov + +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. 
+BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/csg/test/make-upf-1.sh b/workflows/csg/test/make-upf-1.sh new file mode 100755 index 00000000..eb5c0049 --- /dev/null +++ b/workflows/csg/test/make-upf-1.sh @@ -0,0 +1,26 @@ +#!/bin/zsh + + + +OUTPUT=$1 + +# Use ZSH for range operation + +EPOCHS_MIN=10 +EPOCHS_MAX=20 +BATCH_SIZE_MIN=5 +BATCH_SIZE_MAX=7 + + +for EPOCHS in {$EPOCHS_MIN..$EPOCHS_MAX} +do + for BATCH_SIZE in {$BATCH_SIZE_MIN..$BATCH_SIZE_MAX} + do + BS2=$(( 2 ** BATCH_SIZE )) + echo "{" + echo "\"epochs\": $EPOCHS," + echo "\"batch_size\": $BATCH_SIZE," + echo "MORE_PARAMS" + echo "}" + done +done > $OUTPUT diff --git a/workflows/csg/test/test-polaris.sh b/workflows/csg/test/test-polaris.sh new file mode 100755 index 00000000..35539d25 --- /dev/null +++ b/workflows/csg/test/test-polaris.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +# export MODEL_NAME=$1 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + +# model-1.txt is not used currently +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-graphdrp-polaris.txt $THIS/models-1.txt diff --git a/workflows/csg/test/test-small-1.sh b/workflows/csg/test/test-small-1.sh new file mode 100755 index 00000000..22fc8222 --- /dev/null +++ b/workflows/csg/test/test-small-1.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# CMP-CV TEST SMALL 1 + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +# export MODEL_NAME=$1 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. 
; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +# export MODEL_NAME="DrugCell" +# export CANDLE_IMAGE=/homes/ac.gpanapitiya/ccmg-mtg/Singularity/DrugCell.sif +export CANDLE_MODEL_TYPE="SINGULARITY" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt $THIS/models-1.txt diff --git a/workflows/csg/test/upf-1.txt b/workflows/csg/test/upf-1.txt new file mode 100644 index 00000000..6cbf5a43 --- /dev/null +++ b/workflows/csg/test/upf-1.txt @@ -0,0 +1,4 @@ +{"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/software/improve/images/GraphDRP.sif"} diff --git a/workflows/csg/test/upf-graphdrp-polaris.txt b/workflows/csg/test/upf-graphdrp-polaris.txt new file mode 100644 index 00000000..59113d20 --- /dev/null +++ b/workflows/csg/test/upf-graphdrp-polaris.txt @@ -0,0 +1,4 @@ +{"id": "RUN001", "batch_size": 16, "epochs": 4, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN002", "batch_size": 32, "epochs": 3, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN003", "batch_size": 64, "epochs": 2, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} +{"id": "RUN004", "batch_size": 128, "epochs": 1, "model_name": "GraphDRP", "candle_image": "/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif"} diff --git a/workflows/dense-noise/swift/workflow.sh 
b/workflows/dense-noise/swift/workflow.sh new file mode 100755 index 00000000..48b646c3 --- /dev/null +++ b/workflows/dense-noise/swift/workflow.sh @@ -0,0 +1,174 @@ +#! /usr/bin/env bash +set -eu + +# DENSE NOISE WORKFLOW +# Main entry point for DENSE-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "workflow.sh start: MODEL_NAME=$MODEL_NAME" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner + +export TURBINE_JOBNAME="${EXPID}" + +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "workflow.sh: CANDLE_DATA_DIR is not set!" 
+ exit 1 +fi + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data + +# CANDLE_MODEL_IMPL: "container" on Polaris, "py" on Summit/Frontier +CANDLE_MODEL_IMPL="container" + +# Allow the user to set an objective function +SWIFT_LIBS_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +# Output handling +mkdir -pv $TURBINE_OUTPUT/out +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
+ # Some systems do % interpretation in environment variables, + # we escape them in TURBINE_STDOUT here: + if [[ $SITE == "summit" ]] || \ + [[ $SITE == "biowulf" ]] || \ + [[ $SITE == "polaris" ]] + then + : # export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + else + : # export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + fi + STDOUT="" +fi + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $TURBINE_OUTPUT + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_STDOUT \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e MODEL_RETURN \ + -e CANDLE_DATA_DIR \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 | \ + tee $STDOUT + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" 
+ exit 1 +fi + +echo "JOB OK" | tee -a $STDOUT diff --git a/workflows/dense-noise/swift/workflow.swift b/workflows/dense-noise/swift/workflow.swift new file mode 100644 index 00000000..9be7e372 --- /dev/null +++ b/workflows/dense-noise/swift/workflow.swift @@ -0,0 +1,72 @@ +/* + DENSE NOISE WORKFLOW SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("DENSE NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); +printf("model_name: " + model_name); + +int epochs = 10; + +int neurons[] = [500:1000:50]; + +float y_num_noises = 11; // Number of noise levels to try +float y_noise_levels[] = [0:y_num_noises]; +float noise_step = 5; // Difference between noises + +int num_trials = 5; +int trials[] = [0:num_trials-1]; + +config = "/usr/local/Benchmarks/Pilot1/Uno/uno_auc_model.txt"; + +json_template = """ +{ "layer_force": %4i, + "noise" : %5.2f, + "epochs" : %2i, + "config" : "%s", + "experiment_id": "%s", + "run_id": "%s", + "candle_result": "r2", + "ckpt_save_interval": 1 +} +"""; + +foreach neuron in neurons +{ + foreach levely, j in y_noise_levels + { + foreach trial, k in trials + { + y_noise_level = levely * noise_step; + run_id = "%04i-%05.2f-%02i" % (neuron, y_noise_level, trial); + params = json_template % + (neuron, y_noise_level, epochs, config, exp_id, run_id); + printf("running: %s: %s", run_id, params); + result = candle_model_train(params, exp_id, run_id, model_name); + printf("result %s : neuron %i y_noise %0.3f : %s", + run_id, neuron, y_noise_level, 
result); + } + } +} diff --git a/workflows/dense-noise/test/cfg-prm-1.sh b/workflows/dense-noise/test/cfg-prm-1.sh new file mode 100644 index 00000000..b44914a5 --- /dev/null +++ b/workflows/dense-noise/test/cfg-prm-1.sh @@ -0,0 +1,4 @@ + +# CFG PRM 1 + +# Empty diff --git a/workflows/dense-noise/test/cfg-sys-small.sh b/workflows/dense-noise/test/cfg-sys-small.sh new file mode 100644 index 00000000..38189a98 --- /dev/null +++ b/workflows/dense-noise/test/cfg-sys-small.sh @@ -0,0 +1,45 @@ + +# CFG SYS SMALL + +# The number of MPI processes +# Note that 1 processes is reserved for Swift +# The default of 2 gives you 1 worker, i.e., 1 concurrent Python +export PROCS=${PROCS:-2} + +# MPI processes per node +export PPN=${PPN:-2} + +export WALLTIME=${WALLTIME:-00:05:00} + +# CANDLE@ALCF: +# export PROJECT=CSC249ADOA01 +export PROJECT=swift-t-polaris +# export QUEUE="debug" # Up to 2 nodes +# export QUEUE="debug-scaling" # Up to 10 nodes +export QUEUE="prod" # At least 10 nodes + + +# CANDLE@OLCF: +# export PROJECT=MED106 +# export QUEUE=batch + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
+export IGNORE_ERRORS=0 + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" +fi diff --git a/workflows/dense-noise/test/test-1.sh b/workflows/dense-noise/test/test-1.sh new file mode 100755 index 00000000..fb6314ae --- /dev/null +++ b/workflows/dense-noise/test/test-1.sh @@ -0,0 +1,54 @@ +#!/bin/bash +set -eu + +# DENSE NOISE TEST 1 + +usage() +{ + echo "Usage: test MODEL_NAME SITE RUN_DIR" + echo " RUN_DIR: use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + export MODEL_NAME=$1 + SITE=$2 + RUN_DIR=$3 +else + usage + exit 1 +fi + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh +SCRIPT=$( basename $0 .sh ) + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-small.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +export CANDLE_MODEL_TYPE="BENCHMARKS" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +echo "$SCRIPT: OK" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/grid/README.md b/workflows/grid/README.md index 247ae873..5c0d11b2 100644 --- a/workflows/grid/README.md +++ b/workflows/grid/README.md @@ -1,14 +1,18 @@ # Simple parameter sweep with Swift, using command line programs + **run** runs **run-sweep.swift**, which runs a parameter sweep. It calls command-line programs as follows: + - determineParameters.{sh,py}: Read data/ **settings.json** for sweep parameters, and return as a string for use by Swift program - evaluateOne.{sh,py}: Runs a single experiment. (Calls p1b1_baseline). 
- computeStats.{sh,py}: Ingests data from all of the experiments and computes simple stats. -Usage: ./run +Usage: ./run Notes: -- **settings.json**: sweep parameters. Parameters must be labeled "1", "2", "3", "4", ... -1: epochs + +- **settings.json**: sweep parameters. Parameters must be labeled "1", "2", "3", "4", ... + 1: epochs + 2. batch_size 3. N1 -4. NE \ No newline at end of file +4. NE diff --git a/workflows/grid/data/settings.json b/workflows/grid/data/settings.json index acb160ba..01a65814 100644 --- a/workflows/grid/data/settings.json +++ b/workflows/grid/data/settings.json @@ -1,11 +1,10 @@ { - "parameters": - { - "epochs": [4, 6, 8 ], - "batch_size": [30, 40], - "N1": [1500], - "NE": [600], - "latent_dim": [2, 8, 16, 32, 64], - "learning_rate": [0.00001, 0.0001, 0.001, 0.1] - } + "parameters": { + "epochs": [4, 6, 8], + "batch_size": [30, 40], + "N1": [1500], + "NE": [600], + "latent_dim": [2, 8, 16, 32, 64], + "learning_rate": [0.00001, 0.0001, 0.001, 0.1] + } } diff --git a/workflows/grid/python/computeStats.py b/workflows/grid/python/computeStats.py index f414c378..4e33ee1b 100644 --- a/workflows/grid/python/computeStats.py +++ b/workflows/grid/python/computeStats.py @@ -1,40 +1,44 @@ +import json +import os import sys from collections import defaultdict -import json, os + def extractVals(A): B = defaultdict(dict) A1 = A.split() for n, val in zip(A1[0::2], A1[1::2]): B[n] = float(val) - return(B) + return B + def computeStats(swiftArrayAsString): A = extractVals(swiftArrayAsString) vals = [] for a in A: vals += [A[a]] - print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) - - filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt" - # writing the val loss to the output file - with open(filename, 'w') as the_file: - the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + print("%d values, with min=%f, max=%f, avg=%f\n" % + 
(len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) + filename = os.environ["TURBINE_OUTPUT"] + "/final_stats.txt" + # writing the val loss to the output file + with open(filename, "w") as the_file: + the_file.write( + "%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) -if (len(sys.argv) < 2): - print('requires arg=dataFilename') - sys.exit(1) +if len(sys.argv) < 2: + print("requires arg=dataFilename") + sys.exit(1) dataFilename = sys.argv[1] try: - with open(dataFilename, 'r') as the_file: + with open(dataFilename, "r") as the_file: data = the_file.read() except IOError as e: print("Could not open: %s" % dataFilename) print("PWD is: '%s'" % os.getcwd()) computeStats(data) - diff --git a/workflows/grid/python/determineParameters.py b/workflows/grid/python/determineParameters.py index 574c4819..ad85a36f 100644 --- a/workflows/grid/python/determineParameters.py +++ b/workflows/grid/python/determineParameters.py @@ -1,7 +1,10 @@ -import sys, json, os +import json +import os +import sys # ===== Definitions ========================================================= + def loadSettings(settingsFilename): print("Reading settings: %s" % settingsFilename) try: @@ -15,44 +18,54 @@ def loadSettings(settingsFilename): epochs = settings['parameters']["epochs"] batch_size = settings['parameters']["batch_size"] N1 = settings['parameters']["N1"] - NE = settings['parameters']["NE"] - latent_dim = settings['parameters']["latent_dim"] - learning_rate = settings['parameters']["learning_rate"] + NE = settings['parameters']["NE"] + latent_dim = settings['parameters']["latent_dim"] + learning_rate = settings['parameters']["learning_rate"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) - return(epochs, batch_size, N1, NE, latent_dim, learning_rate) + return 
(epochs, batch_size, N1, NE, latent_dim, learning_rate) + def expand(Vs, fr, to, soFar): soFarNew = [] for s in soFar: for v in Vs[fr]: - if s == '': - soFarNew += [str(v)] - else: - soFarNew += [s+','+str(v)] - if fr==to: - return(soFarNew) + if s == '': + soFarNew += [str(v)] + else: + soFarNew += [s + ',' + str(v)] + if fr == to: + return (soFarNew) else: - return expand(Vs, fr+1, to, soFarNew) + return expand(Vs, fr + 1, to, soFarNew) + # ===== Main program ======================================================== if (len(sys.argv) < 3): - print('requires arg1=settingsFilename and arg2=paramsFilename') - sys.exit(1) + print('requires arg1=settingsFilename and arg2=paramsFilename') + sys.exit(1) settingsFilename = sys.argv[1] -paramsFilename = sys.argv[2] +paramsFilename = sys.argv[2] -epochs, batch_size, N1, NE, latent_dim, learning_rate = loadSettings(settingsFilename) +epochs, batch_size, N1, NE, latent_dim, learning_rate = loadSettings( + settingsFilename) -values = {1:epochs, 2: batch_size, 3: N1, 4: NE, 5: latent_dim, 6: learning_rate} +values = { + 1: epochs, + 2: batch_size, + 3: N1, + 4: NE, + 5: latent_dim, + 6: learning_rate +} print values results = expand(values, 1, len(values), ['']) result = ':'.join(results) with open(paramsFilename, 'w') as the_file: the_file.write(result) - diff --git a/workflows/grid/python/evaluateOne.py b/workflows/grid/python/evaluateOne.py index 00910697..3b823eb6 100644 --- a/workflows/grid/python/evaluateOne.py +++ b/workflows/grid/python/evaluateOne.py @@ -1,48 +1,52 @@ +import json +import os +import socket import sys + import p1b1_runner -import json, os -import socket -if (len(sys.argv) < 3): - print('requires arg1=param and arg2=filename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=param and arg2=filename") + sys.exit(1) parameterString = sys.argv[1] -filename = sys.argv[2] +filename = sys.argv[2] # print (parameterString) -print ("filename is " + filename) -print (socket.gethostname()) - -#List of 
hyperparameters - edit this to add or remove a parameter -epochs, batch_size, d1, d2, ld, lr = parameterString.split(',') - -hyper_parameter_map = {'epochs' : int(epochs)} -hyper_parameter_map['framework'] = 'keras' -hyper_parameter_map['batch_size'] = int(batch_size) -hyper_parameter_map['dense'] = [int(d1), int(d2)] -hyper_parameter_map['latent_dim'] = int(ld) -hyper_parameter_map['learning_rate'] = float(lr) - -hyper_parameter_map['run_id'] = parameterString -# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] -hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] -sys.argv = ['p1b1_runner'] +print("filename is " + filename) +print(socket.gethostname()) + +# List of hyperparameters - edit this to add or remove a parameter +epochs, batch_size, d1, d2, ld, lr = parameterString.split(",") + +hyper_parameter_map = {"epochs": int(epochs)} +hyper_parameter_map["framework"] = "keras" +hyper_parameter_map["batch_size"] = int(batch_size) +hyper_parameter_map["dense"] = [int(d1), int(d2)] +hyper_parameter_map["latent_dim"] = int(ld) +hyper_parameter_map["learning_rate"] = float(lr) + +hyper_parameter_map["run_id"] = parameterString +# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] +hyper_parameter_map["save"] = (os.environ["TURBINE_OUTPUT"] + "/output-" + + os.environ["PMI_RANK"]) +sys.argv = ["p1b1_runner"] val_loss = p1b1_runner.run(hyper_parameter_map) -print (val_loss) +print(val_loss) -sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] + "/procname-" + parameterString -with open(sfn, 'w') as sfile: +sfn = (os.environ["TURBINE_OUTPUT"] + "/output-" + os.environ["PMI_RANK"] + + "/procname-" + parameterString) +with open(sfn, "w") as sfile: sfile.write(socket.getfqdn()) - proc_id = "-"+ str(os.getpid()) + proc_id = "-" + str(os.getpid()) sfile.write(proc_id) # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 from keras import backend 
as K + K.clear_session() # writing the val loss to the output file (result-*) -with open(filename, 'w') as the_file: +with open(filename, "w") as the_file: the_file.write(repr(val_loss)) - diff --git a/workflows/grid/python/p1b1_runner.py b/workflows/grid/python/p1b1_runner.py index 7ceb0c59..ddb43b10 100644 --- a/workflows/grid/python/p1b1_runner.py +++ b/workflows/grid/python/p1b1_runner.py @@ -1,24 +1,30 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. import sys -if not hasattr(sys, 'argv'): - sys.argv = ['p1b1'] + +if not hasattr(sys, "argv"): + sys.argv = ["p1b1"] import json import os + import p1b1 import runner_utils + def run(hyper_parameter_map): - framework = hyper_parameter_map['framework'] - if framework is 'keras': + framework = hyper_parameter_map["framework"] + if framework is "keras": import p1b1_baseline_keras2 + pkg = p1b1_baseline_keras2 - elif framework is 'mxnet': + elif framework is "mxnet": import p1b1_baseline_mxnet + pkg = p1b1_baseline_mxnet - elif framework is 'neon': + elif framework is "neon": import p1b1_baseline_neon + pkg = p1b1_baseline_neon else: raise ValueError("Invalid framework: {}".format(framework)) @@ -27,23 +33,24 @@ def run(hyper_parameter_map): params = pkg.initialize_parameters() runner_utils.format_params(hyper_parameter_map) - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v print(params) history = pkg.run(params) - if framework is 'keras': + if framework is "keras": # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass # use the last validation_loss as the value to minimize - 
val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] return val_loss[-1] diff --git a/workflows/grid/swift/grid-sweep.swift b/workflows/grid/swift/grid-sweep.swift index 1344660a..78aceb5b 100644 --- a/workflows/grid/swift/grid-sweep.swift +++ b/workflows/grid/swift/grid-sweep.swift @@ -62,4 +62,3 @@ file tmp = write(repr(results)); //trace("Temporary filename is: " + filename(tmp)); computeStats(filename(tmp)); - diff --git a/workflows/grid/swift/workflow.sh b/workflows/grid/swift/workflow.sh index 27d2e909..3d153a7a 100755 --- a/workflows/grid/swift/workflow.sh +++ b/workflows/grid/swift/workflow.sh @@ -93,7 +93,7 @@ CMD_LINE_ARGS=( -param_set_file=$PARAM_SET_FILE -ds=$DESIGN_SIZE -pp=$PROPOSE_POINTS -it=$MAX_ITERATIONS - -settings=$EMEWS_PROJECT_ROOT/data/settings.json + -settings=$EMEWS_PROJECT_ROOT/data/settings.json -exp_id=$EXPID -benchmark_timeout=$BENCHMARK_TIMEOUT -site=$SITE @@ -115,7 +115,7 @@ mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh diff --git a/workflows/grid/test/cfg-prm-1.sh b/workflows/grid/test/cfg-prm-1.sh index e7698292..49db0d0a 100644 --- a/workflows/grid/test/cfg-prm-1.sh +++ b/workflows/grid/test/cfg-prm-1.sh @@ -13,4 +13,3 @@ MAX_BUDGET=${MAX_BUDGET:-1800} DESIGN_SIZE=${DESIGN_SIZE:-2} PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/parameter_set.R} MODEL_NAME="p1b1" - diff --git a/workflows/grid/test/cfg-sys-1.sh b/workflows/grid/test/cfg-sys-1.sh index 6e48105f..b0afa605 100644 --- a/workflows/grid/test/cfg-sys-1.sh +++ b/workflows/grid/test/cfg-sys-1.sh @@ -18,4 +18,3 @@ export WALLTIME=${WALLTIME:-01:33:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. 
BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/mlrMBO/.gitignore b/workflows/mlrMBO/.gitignore index 8b137891..e69de29b 100644 --- a/workflows/mlrMBO/.gitignore +++ b/workflows/mlrMBO/.gitignore @@ -1 +0,0 @@ - diff --git a/workflows/mlrMBO/README.md b/workflows/mlrMBO/README.md index f3d36ec0..14768da6 100644 --- a/workflows/mlrMBO/README.md +++ b/workflows/mlrMBO/README.md @@ -2,56 +2,57 @@ mlrMBO is an iterative optimizer written in R. It evaluates the best values of hyperparameters for CANDLE "Benchmarks" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` - given set of parameters. -## Running ## +## Running -1. cd into the *~/Supervisor/workflows/mlrMBO/test* directory -2. Specify the MODEL_NAME in *test-1.sh* file, hyperparameters in *cfg-prm-1.txt* -3. Specify the #procs, queue etc. in *cfg-sys-1.sh* file -4. Launch the test by invoking *./upf-1.sh * - where machine_name can be cori, theta, titan etc. +1. cd into the _~/Supervisor/workflows/mlrMBO/test_ directory +2. Specify the MODEL_NAME in _test-1.sh_ file, hyperparameters in _cfg-prm-1.txt_ +3. Specify the #procs, queue etc. in _cfg-sys-1.sh_ file +4. Launch the test by invoking _./upf-1.sh machine_name_ + where machine_name can be cori, theta, titan etc. 5. The benchmark will be run for the number of processors specified 6. Final objective function value will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and `cd` to `workflows/nt3_mlrMBO` (the directory containing this README). -* NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. 
-* benchmark data - - See the individual benchmarks README for obtaining the initial data +- benchmark data - + See the individual benchmarks README for obtaining the initial data -## Calling sequence ## +## Calling sequence Function calls :- -* test-1.sh -> swift/workflow.sh -> swift/workflow.swift -> -common/swift/obj_app.swift -> common/sh/model.sh -> -common/python/model_runner.py -> 'calls the benchmark' + +- test-1.sh -> swift/workflow.sh -> swift/workflow.swift -> + common/swift/obj_app.swift -> common/sh/model.sh -> + common/python/model_runner.py -> 'calls the benchmark' Scheduling scripts :- -* upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files -## Making Changes ## +- upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files + +## Making Changes -### Structure ### +### Structure -The point of the script structure is that it is easy to make copy and modify the `test-\*.sh` script, and the `cfg-\*.sh` scripts. These can be checked back into the repo for use by others. The `test-\*.sh` script and the `cfg-\*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. +The point of the script structure is that it is easy to make copy and modify the `test-\*.sh` script, and the `cfg-\*.sh` scripts. These can be checked back into the repo for use by others. The `test-\*.sh` script and the `cfg-\*.sh` scripts should simply contain environment variables that control how `workflow.sh` and `workflow.swift` operate. `test-1` and `cfg-{sys,prm}-1` should be unmodified for simple testing. -### Calling a different objective function ### +### Calling a different objective function To call a different objective function: 1. Copy `common/swift/obj_app.swift` to a new directory and/or file name. 2. Edit the `app` function body to run your code and return the result. 3. 
Edit a `test-\*.sh` script to set environment variables: - * `OBJ_DIR`: Set this to the new directory (If changed. Otherwise, `OBJ_DIR` defaults to the absolute path to common/swift .) - * `OBJ_MODULE`: Set this to the Swift file name without suffix (If changed. Otherwise, `OBJ_MODULE` defaults to `obj_app` .) + - `OBJ_DIR`: Set this to the new directory (If changed. Otherwise, `OBJ_DIR` defaults to the absolute path to common/swift .) + - `OBJ_MODULE`: Set this to the Swift file name without suffix (If changed. Otherwise, `OBJ_MODULE` defaults to `obj_app` .) 4. Run it! Simple test for changing objective function: @@ -66,16 +67,16 @@ Swift: Assertion failed!: test-obj-fail.swift was successfully invoked! ... ``` -This indicates that the code in `test_obj_fail.swift` was executed instead of `obj_app.swift` . +This indicates that the code in `test_obj_fail.swift` was executed instead of `obj_app.swift` . -### Where to check for output ### +### Where to check for output This includes error output. -When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. +When you run the test script, you will get a message about `TURBINE_OUTPUT` . This will be the main output directory for your run. -* On a local system, stdout/stderr for the workflow will go to your terminal. -* On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` +- On a local system, stdout/stderr for the workflow will go to your terminal. 
+- On a scheduled system, stdout/stderr for the workflow will go to `TURBINE_OUTPUT/output.txt` The individual objective function (model) runs stdout/stderr go into directories of the form: diff --git a/workflows/mlrMBO/data/adrp_nightly.R b/workflows/mlrMBO/data/adrp_nightly.R index 9ffc4f9d..8a99d9e3 100644 --- a/workflows/mlrMBO/data/adrp_nightly.R +++ b/workflows/mlrMBO/data/adrp_nightly.R @@ -1,7 +1,8 @@ param.set <- makeParamSet( - makeIntegerParam("epochs", lower = 2, upper = 2), + makeIntegerParam("epochs", lower = 90, upper = 90), makeNumericParam("dropout", lower = 0.1, upper = 0.2), - makeNumericParam("learning_rate", lower = 0.00001, upper = 0.001) - ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) + makeNumericParam("learning_rate", lower = 0.00001, upper = 0.001), + makeDiscreteParam("activation", values = c("elu", "linear", "relu", "sigmoid", "tanh")), + makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop")), + makeDiscreteParam("dense", values = c("500 250 125 60 30", "250 125 60 30", "400 150 75 30","300 175 90 45 20","400 200 100 50 25", "350 170 85 40 20")) ) - diff --git a/workflows/mlrMBO/data/combo_hps_exp_01.R b/workflows/mlrMBO/data/combo_hps_exp_01.R index 2088b353..e97b062d 100644 --- a/workflows/mlrMBO/data/combo_hps_exp_01.R +++ b/workflows/mlrMBO/data/combo_hps_exp_01.R @@ -6,12 +6,12 @@ param.set <- makeParamSet( - + makeDiscreteParam("cell_features", values=c("mirna", "expression")), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), - + # use consecutive 1000-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1000", @@ -32,19 +32,16 @@ param.set <- makeParamSet( makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), makeDiscreteParam("residual", values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), makeDiscreteParam("optimizer", values=c("adam", 
"sgd", "rmsprop")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeIntegerParam("epochs", lower=5, upper=1000) ) - - - diff --git a/workflows/mlrMBO/data/combo_nightly.R b/workflows/mlrMBO/data/combo_nightly.R index 9e3effce..c140ba1a 100644 --- a/workflows/mlrMBO/data/combo_nightly.R +++ b/workflows/mlrMBO/data/combo_nightly.R @@ -6,20 +6,18 @@ param.set <- makeParamSet( - + makeDiscreteParam("cell_features", values=c("expression")), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(1)), makeDiscreteParam("residual", values=c(1, 0)), - - + + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeIntegerParam("epochs", lower=1, upper=3) ) - - diff --git a/workflows/mlrMBO/data/dummy_nightly.R b/workflows/mlrMBO/data/dummy_nightly.R new file mode 100644 index 00000000..797c4e5b --- /dev/null +++ b/workflows/mlrMBO/data/dummy_nightly.R @@ -0,0 +1,20 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeDiscreteParam("test_batch", values = c(8, 16)), + # makeIntegerParam("epochs", lower = 1, upper = 1), +# makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear")), +# makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), + # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + # 
makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) diff --git a/workflows/mlrMBO/data/graphdrp.R b/workflows/mlrMBO/data/graphdrp.R new file mode 100644 index 00000000..e496a74c --- /dev/null +++ b/workflows/mlrMBO/data/graphdrp.R @@ -0,0 +1,13 @@ + +# GraphDRP Hyperparameter Search - Test "small" +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + # makeIntegerParam("epochs", lower = 3, upper = 4), + makeIntegerParam("batch_size" , lower = 32 , upper = 2048 ), + makeNumericParam("learning_rate", lower = 0.000001, upper = 0.1) +) diff --git a/workflows/mlrMBO/data/graphdrp_small.R b/workflows/mlrMBO/data/graphdrp_small.R new file mode 100644 index 00000000..b3e01509 --- /dev/null +++ b/workflows/mlrMBO/data/graphdrp_small.R @@ -0,0 +1,15 @@ + +# GraphDRP Hyperparameter Search - Test "small" +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( +# makeDiscreteParam("test_batch", values = c(8, 16)), + makeIntegerParam("epochs", lower = 1, upper = 6), + # makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + makeNumericParam("dropout", lower = 0.1, upper = 0.5), + 
makeNumericParam("learning_rate", lower = 0.001, upper = 0.5) +) diff --git a/workflows/mlrMBO/data/nt3_nightly.R b/workflows/mlrMBO/data/nt3_nightly.R index 7fc9fa31..8124fc2d 100644 --- a/workflows/mlrMBO/data/nt3_nightly.R +++ b/workflows/mlrMBO/data/nt3_nightly.R @@ -11,7 +11,7 @@ param.set <- makeParamSet( makeIntegerParam("epochs", lower = 2, upper = 5), # makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear")), # makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), - makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + makeDiscreteParam("optimizer", values = c("adam", "sgd")), makeNumericParam("dropout", lower = 0, upper = 0.9), makeNumericParam("learning_rate", lower = 0.00001, upper = 0.1) # makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) diff --git a/workflows/mlrMBO/data/oned.R b/workflows/mlrMBO/data/oned.R new file mode 100644 index 00000000..1a897bf7 --- /dev/null +++ b/workflows/mlrMBO/data/oned.R @@ -0,0 +1,14 @@ + +# NT3 Hyperparameter Search - Test 1 +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeNumericParam("x", lower = 1, upper = 20) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) diff --git a/workflows/mlrMBO/data/p1_gdrp.R 
b/workflows/mlrMBO/data/p1_gdrp.R new file mode 100644 index 00000000..e311f4b6 --- /dev/null +++ b/workflows/mlrMBO/data/p1_gdrp.R @@ -0,0 +1,16 @@ + +# GraphDRP Hyperparameter Search - Test "small" +# These parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + #makeDiscreteParam("test_batch", values = c(8, 17)), + makeDiscreteParam("batch_size", values = c(8, 256)), + makeIntegerParam("epochs", lower = 100, upper = 101), + makeDiscreteParam("optimizer", values = c("adam", "sgd")), + makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.001, upper = 0.1) +) diff --git a/workflows/mlrMBO/data/p1b1_hps_exp_01.R b/workflows/mlrMBO/data/p1b1_hps_exp_01.R index 62f6ee7e..0da5e977 100644 --- a/workflows/mlrMBO/data/p1b1_hps_exp_01.R +++ b/workflows/mlrMBO/data/p1b1_hps_exp_01.R @@ -8,17 +8,17 @@ param.set <- makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - + # large batch_size only makes sense when warmup_lr is on #makeDiscreteParam("batch_size", values=c(32, 64, 128, 256, 512, 1024)), makeIntegerParam("batch_size", lower=5, upper=10, trafo = function(x) 2L^x), - + # use consecutive 978-neuron layers to facilitate residual connections makeDiscreteParam("dense", values=c("1500 500", "978 978", @@ -26,22 +26,20 @@ param.set <- makeParamSet( "978 978 978 978", "978 978 978 978 978", "978 978 978 978 978 978")), - + makeDiscreteParam("residual", values=c(1, 0)), - + 
makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("dropout", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=5, upper=1000) ) - - diff --git a/workflows/mlrMBO/data/p1b1_nightly.R b/workflows/mlrMBO/data/p1b1_nightly.R index 0ca68ca0..6b98b406 100644 --- a/workflows/mlrMBO/data/p1b1_nightly.R +++ b/workflows/mlrMBO/data/p1b1_nightly.R @@ -8,15 +8,15 @@ param.set <- makeParamSet( # we optimize for ae and vae separately makeDiscreteParam("model", values=c("ae")), - + # makeDiscreteParam("latent_dim", values=c(2, 8, 32, 128, 512)), makeIntegerParam("latent_dim", lower=1, upper=9, trafo = function(x) 2L^x), # use a subset of 978 landmark features only to speed up training makeDiscreteParam("use_landmark_genes", values=c(0)), - - + + # use consecutive 978-neuron layers to facilitate residual connections # makeDiscreteParam("dense", values=c("1500 500", # "978 978", @@ -24,22 +24,20 @@ param.set <- makeParamSet( # "978 978 978 978", # "978 978 978 978 978", # "978 978 978 978 978 978")), - + makeDiscreteParam("residual", values=c(1, 0)), - + makeDiscreteParam("activation", values=c("relu", "sigmoid", "tanh")), - + makeDiscreteParam("optimizer", values=c("adam", "sgd")), - + makeNumericParam("learning_rate", lower=0.00001, upper=0.1), - + makeDiscreteParam("reduce_lr", values=c(1, 0)), - + makeDiscreteParam("warmup_lr", values=c(1, 0)), - + makeNumericParam("dropout", lower=0, upper=0.9), - + makeIntegerParam("epochs", lower=2, upper=3) ) - - diff --git a/workflows/mlrMBO/data/sct_nightly.R b/workflows/mlrMBO/data/sct_nightly.R new file mode 100644 index 00000000..93239e08 --- /dev/null +++ b/workflows/mlrMBO/data/sct_nightly.R @@ -0,0 +1,20 @@ + +# NT3 Hyperparameter Search - Test 1 +# These 
parameters should stay small for short tests +# and use no dense parameters to avoid mlrMBO crashes + +# see https://cran.r-project.org/web/packages/ParamHelpers/ParamHelpers.pdfmakeNum +# the parameter names should match names of the arguments expected by the benchmark + +param.set <- makeParamSet( + makeDiscreteParam("batch_size", values = c(16, 32)), + makeIntegerParam("epochs", lower = 1, upper = 1), +# makeDiscreteParam("activation", values = c("softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear")), +# makeDiscreteParam("dense", values = c("500 100 50", "1000 500 100 50", "2000 1000 500 100 50", "2000 1000 1000 500 100 50", "2000 1000 1000 1000 500 100 50")), + makeDiscreteParam("optimizer", values = c("adam", "sgd", "rmsprop", "adagrad", "adadelta")), + makeNumericParam("dropout", lower = 0, upper = 0.9), + makeNumericParam("learning_rate", lower = 0.00001, upper = 0.1) +# makeDiscreteParam("conv", values = c("50 50 50 50 50 1", "25 25 25 25 25 1", "64 32 16 32 64 1", "100 100 100 100 100 1", "32 20 16 32 10 1")) + ## DEBUG PARAMETERS: DON'T USE THESE IN PRODUCTION RUN + ## makeDiscreteParam("conv", values = c("32 20 16 32 10 1")) +) diff --git a/workflows/mlrMBO/swift/workflow.sh b/workflows/mlrMBO/swift/workflow.sh index cf02242a..50b749fe 100755 --- a/workflows/mlrMBO/swift/workflow.sh +++ b/workflows/mlrMBO/swift/workflow.sh @@ -8,70 +8,65 @@ set -eu # Autodetect this workflow directory export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) -if [[ ! 
-d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] -then - echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" - exit 1 -fi -BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd ) -export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/P1B1 -# $BENCHMARKS_ROOT/Pilot1/Attn1:$BENCHMARKS_ROOT/Pilot1/TC1: -# :$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4 export BENCHMARK_TIMEOUT -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) # Source some utility functions used by EMEWS in this script source $WORKFLOWS_ROOT/common/sh/utils.sh -#source "${EMEWS_PROJECT_ROOT}/etc/emews_utils.sh" - moved to utils.sh - -# Uncomment to turn on Swift/T logging. Can also set TURBINE_LOG, -# TURBINE_DEBUG, and ADLB_DEBUG to 0 to turn off logging. -# Do not commit with logging enabled, users have run out of disk space -# export TURBINE_LOG=1 TURBINE_DEBUG=1 ADLB_DEBUG=1 - usage() { - echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME " \ + "[CANDLE_MODEL_TYPE] [CANDLE_IMAGE]" } -if (( ${#} != 5 )) +if (( ${#} != 7 )) && (( ${#} != 5 )) then usage exit 1 fi -if ! 
{ - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID - get_cfg_sys $3 - get_cfg_prm $4 - MODEL_NAME=$5 - } +if (( ${#} == 7 )) then + export CANDLE_MODEL_TYPE=$6 + export CANDLE_IMAGE=$7 +elif (( ${#} == 5 )) +then + CANDLE_MODEL_TYPE="BENCHMARKS" + CANDLE_IMAGE=NONE +else usage exit 1 fi -echo "Running "$MODEL_NAME "workflow" +TURBINE_OUTPUT="" +if [[ $CANDLE_MODEL_TYPE == "SINGULARITY" ]] +then + TURBINE_OUTPUT=$CANDLE_DATA_DIR/output + printf "Running mlrMBO workflow with model %s and image %s:%s\n" \ + $MODEL_NAME $CANDLE_MODEL_TYPE $CANDLE_IMAGE +fi -# Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common +get_site $1 # Sets SITE +get_expid $2 # Sets EXPID +get_cfg_sys $3 +get_cfg_prm $4 +MODEL_NAME=$5 source_site env $SITE source_site sched $SITE -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs - if [[ ${EQR:-} == "" ]] then - abort "The site '$SITE' did not set the location of EQ/R: this will not work!" + abort "The site '$SITE' did not set the location of EQ/R: " \ + "this will not work!" 
fi -export TURBINE_JOBNAME="JOB:${EXPID}" +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + +export TURBINE_JOBNAME="MBO_${EXPID}" RESTART_FILE_ARG="" if [[ ${RESTART_FILE:-} != "" ]] @@ -118,8 +113,13 @@ cp $WORKFLOWS_ROOT/common/R/$R_FILE $PARAM_SET_FILE $CFG_SYS $CFG_PRM $TURBINE_O mkdir -pv $TURBINE_OUTPUT/run # Allow the user to set an objective function -OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +if [[ ${CANDLE_MODEL_TYPE:-} == "SINGULARITY" ]] +then + CANDLE_MODEL_IMPL="container" +fi +CANDLE_MODEL_IMPL=${CANDLE_MODEL_IMPL:-container} +SWIFT_LIBS_DIR=${SWIFT_LIBS_DIR:-$WORKFLOWS_ROOT/common/swift} +SWIFT_MODULE=${SWIFT_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function # Andrew: Allows for custom model.sh file, if that's desired export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} @@ -131,8 +131,10 @@ then echo "Turbine will wait for job completion." fi -# Use for Summit (LSF needs two %) -if [[ ${SITE:-} == "summit" ]] +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] then export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" else @@ -141,8 +143,6 @@ fi mkdir -pv $TURBINE_OUTPUT/out -#swift-t -n $PROCS \ -# -o $TURBINE_OUTPUT/workflow.tic \ if [[ ${MACHINE:-} == "" ]] then STDOUT=$TURBINE_OUTPUT/output.txt @@ -158,19 +158,33 @@ else STDOUT="" fi +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + echo "CANDLE_DATA_DIR is not set in the environment! Exiting..." + exit 1 +fi + +# We use 'swift-t -o' to allow swift-t to prevent scheduler errors +# on Biowulf. 
Reported by ALW 2021-01-21 + +( + PY_ENVS=$( python_envs ) +set -x swift-t -O 0 -n $PROCS \ + -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ -p -I $EQR -r $EQR \ - -I $OBJ_DIR \ - -i $OBJ_MODULE \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -I $SWIFT_LIBS_DIR \ + -i $SWIFT_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e APP_PYTHONPATH \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ - $( python_envs ) \ + $PY_ENVS \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e OBJ_RETURN \ + -e MODEL_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ -e MODEL_SH \ @@ -180,10 +194,13 @@ swift-t -O 0 -n $PROCS \ -e SH_TIMEOUT \ -e TURBINE_STDOUT \ -e IGNORE_ERRORS \ + -e CANDLE_DATA_DIR \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} |& \ - tee $STDOUT - + $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} ) +# 2>&1 | \ +# tee $STDOUT if (( ${PIPESTATUS[0]} )) then @@ -191,7 +208,4 @@ then exit 1 fi -# echo "EXIT CODE: 0" | tee -a $STDOUT - -# Andrew: Needed this so that script to monitor job worked properly (queue_wait... function in utils.sh?) -echo $TURBINE_OUTPUT > turbine-directory.txt +echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/mlrMBO/swift/workflow.swift b/workflows/mlrMBO/swift/workflow.swift index 1f9936d6..161a1bdf 100644 --- a/workflows/mlrMBO/swift/workflow.swift +++ b/workflows/mlrMBO/swift/workflow.swift @@ -14,18 +14,19 @@ import EQR; import R; import assert; import python; -/* Helper for reporting environment variables common/swift/candle_utils.swift -* import candle_utils; -* -* report_env(); -*/ + +// Helper for reporting environment variables for debugging +// Cf. common/swift/candle_utils.swift +// This can be removed as desired. 
+import candle_utils; +report_env(); string emews_root = getenv("EMEWS_PROJECT_ROOT"); string turbine_output = getenv("TURBINE_OUTPUT"); string resident_work_ranks = getenv("RESIDENT_WORK_RANKS"); string r_ranks[] = split(resident_work_ranks,","); int propose_points = toint(argv("pp", "3")); -int max_budget = toint(argv("mb", "110")); +int max_budget = toint(argv("mb", "1000")); int max_iterations = toint(argv("it", "5")); int design_size = toint(argv("ds", "10")); string param_set = argv("param_set_file"); @@ -33,14 +34,15 @@ string exp_id = argv("exp_id"); int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); string restart_file = argv("restart_file", "DISABLED"); string r_file = argv("r_file", "mlrMBO1.R"); - string model_name = getenv("MODEL_NAME"); +string candle_model_type = getenv("CANDLE_MODEL_TYPE"); +string candle_image = getenv("CANDLE_IMAGE"); +string init_params_file = getenv("INIT_PARAMS_FILE"); + printf("CANDLE mlrMBO Workflow"); printf("TURBINE_OUTPUT: " + turbine_output); - - string restart_number = argv("restart_number", "1"); string site = argv("site"); @@ -76,11 +78,11 @@ string FRAMEWORK = "keras"; } else if (params == "EQR_ABORT") { - printf("EQR aborted: see output for R error") => - string why = EQR_get(ME); - printf("%s", why) => + printf("EQR_ABORT: see output for R error") => + string why = EQR_get(ME); + printf("EQR_ABORT: R exception: %s", why) => // v = propagate(why) => - c = false; + c = false; } else { @@ -88,8 +90,8 @@ string FRAMEWORK = "keras"; string results[]; foreach param, j in param_array { - results[j] = obj(param, - "%00i_%000i_%0000i" % (restart_number,i,j)); + run_id = "%02i_%03i_%04i" % (restart_number,i,j); + results[j] = candle_model_train(param, exp_id, run_id, model_name); } string result = join(results, ";"); // printf(result); @@ -99,7 +101,8 @@ string FRAMEWORK = "keras"; } // These must agree with the arguments to the objective function in mlrMBO.R, -// except param.set.file is removed and processed by 
the mlrMBO.R algorithm wrapper. +// except param.set.file is removed and processed by the mlrMBO.R +// algorithm wrapper. string algo_params_template = """ param.set.file='%s', diff --git a/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh b/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh index 9d3afd6f..e1434cf3 100644 --- a/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many runs to evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -23,9 +23,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh b/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh index e2781a0b..583de2d9 100644 --- a/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/cfg-sys-1.sh @@ -36,5 +36,3 @@ export SH_TIMEOUT=${SH_TIMEOUT:-2000} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . 
export IGNORE_ERRORS=1 - - diff --git a/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh b/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh index 69b2d3ad..6bd48cf9 100755 --- a/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh +++ b/workflows/mlrMBO/test-1000-01-mbo/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh b/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh index 9d3afd6f..e1434cf3 100644 --- a/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test-1000-01-rs/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many runs to evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -23,9 +23,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh b/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh index e2781a0b..583de2d9 100644 --- a/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh +++ b/workflows/mlrMBO/test-1000-01-rs/cfg-sys-1.sh @@ -36,5 +36,3 @@ export SH_TIMEOUT=${SH_TIMEOUT:-2000} # Ignore errors: If 1, unknown errors will be reported to model.log # but 
will not bring down the Swift workflow. See model.sh . export IGNORE_ERRORS=1 - - diff --git a/workflows/mlrMBO/test-1000-01-rs/test-restart.sh b/workflows/mlrMBO/test-1000-01-rs/test-restart.sh index 69b2d3ad..6bd48cf9 100755 --- a/workflows/mlrMBO/test-1000-01-rs/test-restart.sh +++ b/workflows/mlrMBO/test-1000-01-rs/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test/big-gdrp.sh b/workflows/mlrMBO/test/big-gdrp.sh new file mode 100755 index 00000000..7b390612 --- /dev/null +++ b/workflows/mlrMBO/test/big-gdrp.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -eu + +# TEST MLRMBO GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-gdrp.sh +export CFG_PRM=$THIS/cfg-prm-gdrp.sh + +# Specify the mlrMBO algorithm R file +export R_FILE=mlrMBO-mbo.R + +CANDLE_MODEL_TYPE="SINGULARITY" +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE diff --git a/workflows/mlrMBO/test/cfg-prm-1.sh b/workflows/mlrMBO/test/cfg-prm-1.sh index fd46deb7..d8209561 100644 --- a/workflows/mlrMBO/test/cfg-prm-1.sh +++ b/workflows/mlrMBO/test/cfg-prm-1.sh @@ -28,7 +28,7 @@ fi if [[ "${PARAM_SET_FILE:-}" == "" ]]; then # PARAM_SET_FILE must be set before this script returns! - echo "Invalid model-" "'${MODEL_NAME:-}'" + echo "Cannot set PARAM_SET_FILE: unknown model: '${MODEL_NAME:-}'" exit 1 fi set +x diff --git a/workflows/mlrMBO/test/cfg-prm-30.sh b/workflows/mlrMBO/test/cfg-prm-30.sh index c00e55fe..f379e9d8 100644 --- a/workflows/mlrMBO/test/cfg-prm-30.sh +++ b/workflows/mlrMBO/test/cfg-prm-30.sh @@ -19,9 +19,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - diff --git a/workflows/mlrMBO/test/cfg-prm-gdrp.sh b/workflows/mlrMBO/test/cfg-prm-gdrp.sh new file mode 100644 index 00000000..79b96369 --- /dev/null +++ b/workflows/mlrMBO/test/cfg-prm-gdrp.sh @@ -0,0 +1,40 @@ +# CFG PRM NIGHTLY + +# mlrMBO settings + +# Total iterations +PROPOSE_POINTS=${PROPOSE_POINTS:-14} +MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} +MAX_ITERATIONS=${MAX_ITERATIONS:-3} +MAX_BUDGET=${MAX_BUDGET:-180} +DESIGN_SIZE=${DESIGN_SIZE:-14} + +# TODO: move the following code to a utility 
library- +# this is a configuration file +# Set the R data file for running +if [ "$MODEL_NAME" = "combo" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_nightly.R} +elif [ "$MODEL_NAME" = "attn" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/attn_nightly.R} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_nightly.R} +elif [ "$MODEL_NAME" = "p1b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_nightly.R} +elif [ "$MODEL_NAME" = "nt3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_nightly.R} +elif [ "$MODEL_NAME" = "p1b3" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_nightly.R} +elif [ "$MODEL_NAME" = "p1b2" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_nightly.R} +elif [ "$MODEL_NAME" = "p2b1" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_nightly.R} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1_gdrp.R} +elif [ "$MODEL_NAME" = "dummy" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} +elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then + PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} +else + printf "Could not find PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME + exit 1 +fi diff --git a/workflows/mlrMBO/test/cfg-prm-nightly.sh b/workflows/mlrMBO/test/cfg-prm-nightly.sh index 02b79fe0..9678314a 100644 --- a/workflows/mlrMBO/test/cfg-prm-nightly.sh +++ b/workflows/mlrMBO/test/cfg-prm-nightly.sh @@ -1,13 +1,13 @@ -# CFG PRM 1 +# CFG PRM NIGHTLY # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-25} +PROPOSE_POINTS=${PROPOSE_POINTS:-5} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} MAX_ITERATIONS=${MAX_ITERATIONS:-3} MAX_BUDGET=${MAX_BUDGET:-180} -DESIGN_SIZE=${DESIGN_SIZE:-15} +DESIGN_SIZE=${DESIGN_SIZE:-5} # 
TODO: move the following code to a utility library- # this is a configuration file @@ -28,11 +28,16 @@ elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_nightly.R} elif [ "$MODEL_NAME" = "p2b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p2b1_nightly.R} +elif [ "$MODEL_NAME" = "graphdrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} elif [ "$MODEL_NAME" = "dummy" ]; then - PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/nt3_nightly.R} -elif [ "$PARAM_SET_FILE" != "" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/dummy_nightly.R} +elif [ "$MODEL_NAME" = "oned" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/oned.R} +elif [[ "${PARAM_SET_FILE:-}" != "" ]]; then PARAM_SET_FILE=${EMEWS_PROJECT_ROOT}/data/${PARAM_SET_FILE} else - echo "Invalid model-" $MODEL_NAME - exit 1 + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/graphdrp_small.R} +# printf "Could not find PARAM_SET_FILE for model: '%s'\n" $MODEL_NAME +# exit 1 fi diff --git a/workflows/mlrMBO/test/cfg-prm-restart.sh b/workflows/mlrMBO/test/cfg-prm-restart.sh index 13c19339..a7581e5b 100644 --- a/workflows/mlrMBO/test/cfg-prm-restart.sh +++ b/workflows/mlrMBO/test/cfg-prm-restart.sh @@ -2,7 +2,7 @@ # Configuration of parameters: 1 # mlrMBO settings -# How many to runs evaluate per iteration -> +# How many runs to evaluate per iteration -> #Adding the number of restart runs to the budget (9 - for the test case) #This is the minimum number of runs required for restart 9 (greater than 8, which is the design size) MAX_BUDGET=${MAX_BUDGET:-25} @@ -24,9 +24,7 @@ elif [ "$MODEL_NAME" = "p1b3" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b3_hps_exp_01.R} elif [ "$MODEL_NAME" = "p1b2" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b2_hps_exp_01.R} -else +else echo "Invalid model-" $MODEL_NAME exit fi - - 
diff --git a/workflows/mlrMBO/test/cfg-prm-summit.sh b/workflows/mlrMBO/test/cfg-prm-summit.sh index 31b7b262..fd58b0d0 100644 --- a/workflows/mlrMBO/test/cfg-prm-summit.sh +++ b/workflows/mlrMBO/test/cfg-prm-summit.sh @@ -1,19 +1,25 @@ -# CFG PRM SUMMIT +# CFG PRM 1 # mlrMBO settings # Total iterations -PROPOSE_POINTS=${PROPOSE_POINTS:-64} +PROPOSE_POINTS=${PROPOSE_POINTS:-6} MAX_CONCURRENT_EVALUATIONS=${MAX_CONCURRET_EVALUATIONS:-1} -MAX_ITERATIONS=${MAX_ITERATIONS:-3} -MAX_BUDGET=${MAX_BUDGET:-280} -DESIGN_SIZE=${DESIGN_SIZE:-64} +MAX_ITERATIONS=${MAX_ITERATIONS:-2} +MAX_BUDGET=${MAX_BUDGET:-180} +DESIGN_SIZE=${DESIGN_SIZE:-6} # TODO: move the following code to a utility library- # this is a configuration file # Set the R data file for running if [ "$MODEL_NAME" = "combo" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/combo_nightly.R} +elif [ "$MODEL_NAME" = "attn" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/attn_nightly.R} +elif [ "$MODEL_NAME" = "sct" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/sct_nightly.R} +elif [ "$MODEL_NAME" = "adrp" ]; then + PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/adrp_nightly.R} elif [ "$MODEL_NAME" = "p1b1" ]; then PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/p1b1_nightly.R} elif [ "$MODEL_NAME" = "nt3" ]; then diff --git a/workflows/mlrMBO/test/cfg-sys-1.sh b/workflows/mlrMBO/test/cfg-sys-1.sh index 147cdd7a..5720edcd 100644 --- a/workflows/mlrMBO/test/cfg-sys-1.sh +++ b/workflows/mlrMBO/test/cfg-sys-1.sh @@ -35,6 +35,13 @@ BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} # This timeout is implemented with the shell command 'timeout' export SH_TIMEOUT=${SH_TIMEOUT:-} +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi + # Ignore errors: If 1, unknown errors will be reported to model.log # but will 
not bring down the Swift workflow. See model.sh . export IGNORE_ERRORS=0 diff --git a/workflows/mlrMBO/test/cfg-sys-30.sh b/workflows/mlrMBO/test/cfg-sys-30.sh index f45ef2c5..0994073d 100644 --- a/workflows/mlrMBO/test/cfg-sys-30.sh +++ b/workflows/mlrMBO/test/cfg-sys-30.sh @@ -17,4 +17,3 @@ export WALLTIME=${WALLTIME:-3:00:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/mlrMBO/test/cfg-sys-gdrp.sh b/workflows/mlrMBO/test/cfg-sys-gdrp.sh new file mode 100644 index 00000000..1a7f8176 --- /dev/null +++ b/workflows/mlrMBO/test/cfg-sys-gdrp.sh @@ -0,0 +1,40 @@ + +# MLRMBO CFG SYS NIGHTLY + +# The number of MPI processes +# Note that 2 processes are reserved for Swift/EMEWS +# The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs +export PROCS=${PROCS:-16} + +# MPI processes per node +# Cori has 32 cores per node, 128GB per node +export PPN=${PPN:-1} +export QUEUE=${QUEUE:-prod} +export WALLTIME=${WALLTIME:-10:55:00} + +# Benchmark run timeout: benchmark run will timeout +# after the specified number of seconds. +# If set to -1 there is no timeout. +# This timeout is implemented with Keras callbacks +BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} + +# Uncomment below to use custom python script to run +# Use file name without .py (e.g, my_script.py) +# MODEL_PYTHON_SCRIPT=my_script + +# Shell timeout: benchmark run will be killed +# after the specified number of seconds. +# If set to -1 or empty there is no timeout. +# This timeout is implemented with the shell command 'timeout' +export SH_TIMEOUT=${SH_TIMEOUT:-} + +# Ignore errors: If 1, unknown errors will be reported to model.log +# but will not bring down the Swift workflow. See model.sh . 
+export IGNORE_ERRORS=0 + +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/mlrMBO/test/cfg-sys-nightly.sh b/workflows/mlrMBO/test/cfg-sys-nightly.sh index 7933779d..c7d1eaf1 100644 --- a/workflows/mlrMBO/test/cfg-sys-nightly.sh +++ b/workflows/mlrMBO/test/cfg-sys-nightly.sh @@ -1,22 +1,17 @@ -# MLRMBO CFG SYS 1 +# MLRMBO CFG SYS NIGHTLY # The number of MPI processes -# Note that 2 processes are reserved for Swift/EMEMS +# Note that 2 processes are reserved for Swift/EMEWS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-4} +export PROCS=${PROCS:-7} # MPI processes per node # Cori has 32 cores per node, 128GB per node export PPN=${PPN:-1} -# For Theta: -# export QUEUE=${QUEUE:-debug-flat-quad} - -# export WALLTIME=${WALLTIME:-00:10:00} -export WALLTIME=${WALLTIME:-10} - -#export PROJECT=Candle_ECP +export QUEUE=${QUEUE:-debug-scaling} +export WALLTIME=${WALLTIME:-00:60:00} # Benchmark run timeout: benchmark run will timeout # after the specified number of seconds. @@ -37,3 +32,10 @@ export SH_TIMEOUT=${SH_TIMEOUT:-} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . 
export IGNORE_ERRORS=0 + +# Resident task worker rank for mlrMBO algorithm +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/mlrMBO/test/cfg-sys-summit.sh b/workflows/mlrMBO/test/cfg-sys-summit.sh index 59f47ed2..b313f3ad 100644 --- a/workflows/mlrMBO/test/cfg-sys-summit.sh +++ b/workflows/mlrMBO/test/cfg-sys-summit.sh @@ -1,20 +1,17 @@ -# MLRMBO CFG SYS SUMMIT + +# MLRMBO CFG SYS 1 # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-66} +export PROCS=${PROCS:-6} # MPI processes per node # Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} -export TURBINE_DIRECTIVE="#BSUB -alloc_flags \"NVME maximizegpfs\"" -export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" -# For Theta: -# export QUEUE=${QUEUE:-debug-flat-quad} +export PPN=${PPN:-6} -# export WALLTIME=${WALLTIME:-00:10:00} -export WALLTIME=${WALLTIME:-360} +# export WALLTIME=${WALLTIME:-06:00:00} +export WALLTIME=02:00:00 #export PROJECT=Candle_ECP @@ -37,3 +34,14 @@ export SH_TIMEOUT=${SH_TIMEOUT:-} # Ignore errors: If 1, unknown errors will be reported to model.log # but will not bring down the Swift workflow. See model.sh . 
export IGNORE_ERRORS=0 + +# Resident task worker count and rank list +# If this is already set, we respect the user settings +# If this is unset, we set it to 1 +# and run the algorithm on the 2nd highest rank +# This value is only read in HPO workflows +if [[ ${TURBINE_RESIDENT_WORK_WORKERS:-} == "" ]] +then + export TURBINE_RESIDENT_WORK_WORKERS=1 + export RESIDENT_WORK_RANKS=$(( PROCS - 2 )) +fi diff --git a/workflows/mlrMBO/test/create-new-test.sh b/workflows/mlrMBO/test/create-new-test.sh index 42c82669..83a6858b 100755 --- a/workflows/mlrMBO/test/create-new-test.sh +++ b/workflows/mlrMBO/test/create-new-test.sh @@ -18,6 +18,3 @@ sed -i -e "s/PROPOSE_POINTS:-5/PROPOSE_POINTS:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_CONCURRENT_EVALUATIONS:-1/MAX_CONCURRENT_EVALUATIONS:-$1/g" cfg-prm-$1.sh sed -i -e "s/DESIGN_SIZE:-10/DESIGN_SIZE:-$1/g" cfg-prm-$1.sh sed -i -e "s/MAX_BUDGET:-180/MAX_BUDGET:-$Budget/g" cfg-prm-$1.sh - - - diff --git a/workflows/mlrMBO/test/restart-combo.csv b/workflows/mlrMBO/test/restart-combo.csv index 9268db2d..5ebc24c1 100644 --- a/workflows/mlrMBO/test/restart-combo.csv +++ b/workflows/mlrMBO/test/restart-combo.csv @@ -8,4 +8,3 @@ y,batch_size,epochs 41.376564008,512,26 6.5089799458,16,30 20.2991980919,64,23 - diff --git a/workflows/mlrMBO/test/test-1.sh b/workflows/mlrMBO/test/test-1.sh index 029c54f9..22170c6f 100755 --- a/workflows/mlrMBO/test/test-1.sh +++ b/workflows/mlrMBO/test/test-1.sh @@ -5,7 +5,7 @@ set -eu usage() { - echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo "Usage: test MODEL_NAME SITE RUN_DIR(optional)" echo " RUN_DIR is optional, use -a for automatic" } @@ -32,6 +32,8 @@ WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) source $WORKFLOWS_ROOT/common/sh/utils.sh # Select configurations +# Temporarily hard-coding to graphdrp: +export PARAM_SET_FILE=$THIS/../data/graphdrp_small.R export CFG_SYS=$THIS/cfg-sys-1.sh export CFG_PRM=$THIS/cfg-prm-1.sh @@ -43,7 +45,7 @@ export R_FILE=mlrMBO-ils.R # What to return from the objective function (Keras model) # val_loss (default) and val_corr are supported -export OBJ_RETURN="val_loss" +export MODEL_RETURN="val_loss" if [[ $SITE == "theta" ]] then diff --git a/workflows/mlrMBO/test/test-30.sh b/workflows/mlrMBO/test/test-30.sh index 0b6789fd..304ad5fb 100755 --- a/workflows/mlrMBO/test/test-30.sh +++ b/workflows/mlrMBO/test/test-30.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/mlrMBO/test/test-gdrp-1.sh b/workflows/mlrMBO/test/test-gdrp-1.sh new file mode 100755 index 00000000..28e137d2 --- /dev/null +++ b/workflows/mlrMBO/test/test-gdrp-1.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -eu + +# TEST MLRMBO GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. 
; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh + +# Specify the mlrMBO algorithm R file +export R_FILE=mlrMBO-mbo.R + +CANDLE_MODEL_TYPE="SINGULARITY" +# CANDLE_IMAGE=/software/improve/images/GraphDRP.sif # lambda +CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif # Polaris + + +export MODEL_NAME="graphdrp" + +# Currently ignored: +export OBJ_RETURN="val_loss" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $CFG_PRM $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE diff --git a/workflows/mlrMBO/test/test-graphdrp-lambda0.sh b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh new file mode 100755 index 00000000..35cbbb50 --- /dev/null +++ b/workflows/mlrMBO/test/test-graphdrp-lambda0.sh @@ -0,0 +1,72 @@ +#!/bin/bash +set -eu + +# MLRMBO TEST NIGHTLY + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 3 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export PARAM_SET_FILE=graphdrp_small.R +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh + +# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +export R_FILE=mlrMBO-mbo.R + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +# export CANDLE_MODEL_TYPE="SINGULARITY" +# export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" +# export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" + +export CANDLE_MODEL_TYPE="BENCHMARKS" +export CANDLE_IMAGE="NONE" +export CANDLE_MODEL_IMPL="app" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/mlrMBO/test/test-model-lambda.sh b/workflows/mlrMBO/test/test-model-lambda.sh new file mode 100755 index 00000000..2f12516f --- /dev/null +++ b/workflows/mlrMBO/test/test-model-lambda.sh @@ -0,0 +1,86 @@ +#!/bin/bash +set -eu + +# MLRMBO TEST NIGHTLY + +usage() +{ + echo "Usage: test BENCHMARK_NAME SITE RUN_DIR EXPERIMENT_PARAMATER_FILE" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 4 )) +then + RUN_DIR=$3 +elif (( ${#} == 2 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + + + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. 
&& /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-nightly.sh +export CFG_PRM=$THIS/cfg-prm-nightly.sh +export PARAM_SET_FILE=$4 + +# Move experiment config in place - is R file wtf +if [ -f $PARAM_SET_FILE ] +then + echo $WORKFLOWS_ROOT + echo $EMEWS_PROJECT_ROOT + FNAME=$( basename $PARAM_SET_FILE ) + cp $PARAM_SET_FILE $EMEWS_PROJECT_ROOT/data/$FNAME + PARAM_SET_FILE=$FNAME +fi + + +# Specify the R file for This file must be present in the $EMEWS_PROJECT_ROOT/R +export R_FILE=mlrMBO-mbo.R + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export MODEL_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +export CANDLE_MODEL_TYPE="SINGULARITY" +export CANDLE_IMAGE="/software/improve/images/GraphDRP.sif" +export INIT_PARAMS_FILE="/software/improve/graphdrp_default_model.txt" + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM \ + $MODEL_NAME \ + $CANDLE_MODEL_TYPE $CANDLE_IMAGE + +# Check job output +TURBINE_OUTPUT=$( readlink turbine-output ) +echo $TURBINE_OUTPUT +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) +#check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/mlrMBO/test/test-nightly.sh b/workflows/mlrMBO/test/test-nightly.sh index 53ee507b..378bf8a9 100755 --- a/workflows/mlrMBO/test/test-nightly.sh +++ b/workflows/mlrMBO/test/test-nightly.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -# MLRMBO TEST 1 +# MLRMBO TEST NIGHTLY usage() { @@ -47,10 +47,11 @@ then export WAIT=1 fi +export CANDLE_MODEL_TYPE="Benchmarks" + # Submit job $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME - # Check job output 
TURBINE_OUTPUT=$( readlink turbine-output ) echo $TURBINE_OUTPUT diff --git a/workflows/mlrMBO/test/test-restart.sh b/workflows/mlrMBO/test/test-restart.sh index 6a172ae3..e242c987 100755 --- a/workflows/mlrMBO/test/test-restart.sh +++ b/workflows/mlrMBO/test/test-restart.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" +echo "Usage test-1.sh BECHMARK_NAME SITE RUN_DIR(optional, -a=automatic)" RUN_DIR="" @@ -14,7 +14,7 @@ then echo "Automatically assigning run directory in ../experiments folder" RUN_DIR="-a" else - echo "Usage test SITE RUN_DIR(optional)" + echo "Usage test SITE RUN_DIR(optional)" exit 1 fi diff --git a/workflows/one-shot/load.py b/workflows/one-shot/load.py index 43b8b3c0..a77fb647 100644 --- a/workflows/one-shot/load.py +++ b/workflows/one-shot/load.py @@ -1,9 +1,10 @@ - # Performance test for pandas.read_csv() import sys + import pandas as pd F = sys.argv[1] -(pd.read_csv(F, header=None, low_memory=False, usecols=None).values).astype('float32') +(pd.read_csv(F, header=None, low_memory=False, + usecols=None).values).astype("float32") diff --git a/workflows/pbt/Readme.md b/workflows/pbt/Readme.md index e1cd16a2..88e61ef6 100644 --- a/workflows/pbt/Readme.md +++ b/workflows/pbt/Readme.md @@ -1,4 +1,4 @@ -# PBT Workflow # +# PBT Workflow PBT is an asynchronous optimization algorithm for jointly optimizing a population of models and their hyperparameters while effectively using a fixed @@ -6,12 +6,12 @@ computational budget. Like a simple parallel grid search, PBT begins by randomly sampling selected hyperparameters and initial weights and training multiple models in parallel using these hyperparameters and weights. 
However, unlike a parallel search, each training run periodically and -asynchronously runs an *evaluate* method when a model is considered *ready*, comparing its performance against that +asynchronously runs an _evaluate_ method when a model is considered _ready_, comparing its performance against that of other models. If it is under-performing, PBT uses two additional methods to -improve performance: *exploit* and *explore*. Exploit leverages the work of the +improve performance: _exploit_ and _explore_. Exploit leverages the work of the population as a whole by replacing an underperforming model with a better one, i.e., by replacing a model’s current weights with those of the better performing -model. Explore attempts to find new better performing hyperparameters by +model. Explore attempts to find new better performing hyperparameters by perturbing those of the better performing model. Training then continues with the new weights and the new hyperparameters. Evaluate, exploit, and explore are performed asynchronously and independently by each model for some specified @@ -30,25 +30,26 @@ necessary.) During the explore, a model perturbs the learning rate of the selected better performing model, and then continues training with the new weights and learning rate. -## Requirements ## +## Requirements -* This workflow: git@github.com:ECP-CANDLE/Supervisor.git. Clone and cd to workflows/pbt (the directory containing this README). +- This workflow: git@github.com:ECP-CANDLE/Supervisor.git. Clone and cd to workflows/pbt (the directory containing this README). -* Python: the PBT workflow has been tested under Python 2.7. +- Python: the PBT workflow has been tested under Python 2.7. -* MPI for Python (mpi4py): http://mpi4py.scipy.org/docs/ +- MPI for Python (mpi4py): http://mpi4py.scipy.org/docs/ -* Keras: https://keras.io +- Keras: https://keras.io -* CANDLE Benchmark Code: git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch to the frameworks branch. 
+- CANDLE Benchmark Code: git@github.com:ECP-CANDLE/Benchmarks.git. Clone and switch to the frameworks branch. -* TC1 benchmark data: - ``` +- TC1 benchmark data: + + ``` ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_test.csv ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_train.csv - ``` + ``` - `type_18_300_train.csv` and `type_18_300_test.csv` should be copied into `X/Benchmarks/Data/Pilot1`, where X is wherever you cloned the Benchmark repository. For example, from within X/Benchmarks + `type_18_300_train.csv` and `type_18_300_test.csv` should be copied into `X/Benchmarks/Data/Pilot1`, where X is wherever you cloned the Benchmark repository. For example, from within X/Benchmarks ``` mkdir -p Data/Pilot1 @@ -57,8 +58,8 @@ weights and learning rate. wget ftp://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/type-class/type_18_300_train.csv ``` +## Running the Workflow -## Running the Workflow ## The PBT workflow is an MPI application that when given N number of processes, runs N - 1 tc1 models, and uses the remaining process to run a datastore into which the models can put and get model peformance data. The workflow can be run using the scripts in the `scripts` directory. Two scripts are provided: `local_run_pbt.sh` and `sbatch_run_pbt.sh`. The former can be used to run on a local desktop or laptop. The latter can be used to submit the PBT workflow on hpc resources @@ -68,9 +69,9 @@ When run the PBT workflow will create an experiments directory in which the outp each tc1 instance writes is model weights every epoch, and an output.csv file that records the accuracy, loss, learning rate, validation accuracy, and validation loss for each model (identified by MPI rank) each epoch. Additionally each tc1 model run will execute within its own `run_N` instance directory (e.g. `run_1`, `run_2` and so forth) within the output directory. 
-### local_run_pbt.sh ### +### local_run_pbt.sh - `local_run_pbt.sh` takes 3 arguments +`local_run_pbt.sh` takes 3 arguments 1. The number of processes to use 2. An experiment id @@ -79,7 +80,7 @@ that records the accuracy, loss, learning rate, validation accuracy, and validat The experiment id is used to as the name of the experiments directory into which the model output will be written as mentioned above. For example, given the location of the `scripts` directory as `workflows/pbt/scripts` and an experiment id of `r1`, the experiments directory will be `workflows/pbt/experiments/r1`. -### sbatch_run_pbt.sh ### +### sbatch_run_pbt.sh `sbatch_run_pbt.sh` takes 2 arguments: @@ -93,107 +94,119 @@ experiment id of `r1`, the experiments directory will be `workflows/pbt/experime `scripts/pbt.sbatch`. That file can be copied and edited as appropriate, setting the queue, walltime, python, etc. for your HPC machine. It is currently configured for NERSC's Cori system. -### Hyperparameter Configuration File ### +### Hyperparameter Configuration File The PBT workflow uses a json format file for defining the hyperparameter space used by the PBT algorithm. The PBT workflow includes 2 sample hyperparameter configuration files for the tc1 model. -* `data/tc1_params_full.json`: runs the full tc1 model, including the default convolution layer and no feature subsampling. -* `data/tc1_params_small.json`: runs a faster version of the tc1 model by ommitting the convolution layer and subsampling the features. +- `data/tc1_params_full.json`: runs the full tc1 model, including the default convolution layer and no feature subsampling. +- `data/tc1_params_small.json`: runs a faster version of the tc1 model by ommitting the convolution layer and subsampling the features. The hyperparameter configuration file has a json format consisting of a list of json dictionaries, each one of which defines a hyperparameter. 
Each dictionary has the following required keys: -* name: the name of the hyperparameter (e.g. epochs) -* type: determines how the models are initialized from the named parameter - one of `constant`, `int`, `float`, `logical`, or `categorical`. - * `constant`: all the tc1 models are initialized with the specifed value - * `int`: each tc1 model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds - * `float`: each tc1 model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds - * `logical`: each tc1 model is initialized with a random boolean. - * `categorical`: each tc1 model is initialized with an element chosen at random from the list of elements in `values`. +- name: the name of the hyperparameter (e.g. epochs) +- type: determines how the models are initialized from the named parameter - one of `constant`, `int`, `float`, `logical`, or `categorical`. + - `constant`: all the tc1 models are initialized with the specifed value + - `int`: each tc1 model is initialized with an int randomly drawn from the range defined by `lower` and `upper` bounds + - `float`: each tc1 model is initialized with a float randomly drawn from the range defined by `lower` and `upper` bounds + - `logical`: each tc1 model is initialized with a random boolean. + - `categorical`: each tc1 model is initialized with an element chosen at random from the list of elements in `values`. The following keys are required depending on value of the `type` key. 
If the `type` is `constant`: - * `value`: the constant value + +- `value`: the constant value If the `type` is `int`, or `float`: - * `lower`: the lower bound of the range to randomly draw from - * `upper`: the upper bound of the range to randomly draw from + +- `lower`: the lower bound of the range to randomly draw from +- `upper`: the upper bound of the range to randomly draw from If the `type` is `categorical`: - * `values`: the list of elements to randomly choose from - * `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` + +- `values`: the list of elements to randomly choose from +- `element_type`: the type of the elements to choose from. One of `int`, `float`, `string`, or `logical` A sample hyperparameter definition file: ```javascript [ { - "name": "epochs", - "type": "constant", - "value": 5 + name: "epochs", + type: "constant", + value: 5, }, { - "name": "activation", - "type": "categorical", - "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + name: "activation", + type: "categorical", + element_type: "string", + values: [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear", + ], }, { - "name": "batch_size", - "type": "categorical", - "element_type": "int", - "values": [32, 64] + name: "batch_size", + type: "categorical", + element_type: "int", + values: [32, 64], }, { - "name": "lr", - "type": "float", - "lower": 0.0001, - "upper": 0.01 - } -] + name: "lr", + type: "float", + lower: 0.0001, + upper: 0.01, + }, +]; ``` Note that any other keys are ignored by the workflow but can be used to add additional information about the hyperparameter. For example, the sample files contain a `comment` entry that contains additional information about that hyperparameter. - -## Workflow Explained ## +## Workflow Explained The workflow consists of 3 parts. 
The DNN tc1 model in `models/tc1`, the PBT python code in `python/pbt.py` and the python code that runs the tc1 model using PBT, `python/tc1_pbt.py`. -### tc1 ### +### tc1 The tc1 model is a lightly modified version of the CANDLE tc1 benchmark. The code has been updated so that an external Keras callback can be passed through `models/tc1/tc1_runner.run()` and attached to the model. The PBT algorithnm is run via this callback. -### `python/pbt.py` ### +### `python/pbt.py` `pbt.py` provides the model-agnostic framework code for implementing a PBT workflow. It has 4 main components. 1. A PBTMetaDataStore class. This implements an in-memory datastore for the model run performance and hyperparamter data. It also manages a locking scheme for model weight file IO in order to prevent issues with concurrent -file access. + file access. 2. A PBTClient class. This allows an individual instance of a model to communicate with the PBTMetaDataStore, sending it peformance data, querying -performance data for a better performing model, requesting read and write locks for reading other model weights and writing its own. The PBTClient and -PBTMetaDataStore communicate via MPI. + performance data for a better performing model, requesting read and write locks for reading other model weights and writing its own. The PBTClient and + PBTMetaDataStore communicate via MPI. 3. A PBTCallback class. This is a Keras callback that given model-specific -*ready*, *exploit*, and *explore* implementations will pass its current performance data to the data store and write its model's weights -every epoch. Then when *ready*, it will perform an an *evaluate* to find a better performing model. Assuming one is found, an *exploit* and *explore* be peformed to update its model's weights and hyperparameters appropriately. A PBTCallback uses a PBTClient to ommunicate with a PBTMetaDataStore. 
+ _ready_, _exploit_, and _explore_ implementations will pass its current performance data to the data store and write its model's weights + every epoch. Then when _ready_, it will perform an an _evaluate_ to find a better performing model. Assuming one is found, an _exploit_ and _explore_ be peformed to update its model's weights and hyperparameters appropriately. A PBTCallback uses a PBTClient to ommunicate with a PBTMetaDataStore. -4. A PBTWorker interface. This interface defines the API for PBT's *ready*, -*exploit* and *explore* steps. Client code implements this interface, -supplying implementations appropriate to that particular workflow. +4. A PBTWorker interface. This interface defines the API for PBT's _ready_, + _exploit_ and _explore_ steps. Client code implements this interface, + supplying implementations appropriate to that particular workflow. -### `python/tc1_pbt.py` ### +### `python/tc1_pbt.py` `tc1_pbt.py` implements PBT for the tc1 model using the classes and functions in `pbt.py`. In `tc1_pbt.py`, rank 0 first generates and distribute the hyperparameters to the models running on the other ranks. The ga_utils package is used to read the hyperparameter definition file (see above) and generate, @@ -204,28 +217,28 @@ PBTMetaDataStore's constructor is passed the path of the output directory where the `output.csv` file will be written together with a the path to a log file in which user customizable log messages are written. PBTMetaDataStore also takes a reference -to an *evaluate* function that is used to evaluate a model's current performance +to an _evaluate_ function that is used to evaluate a model's current performance and select a better performing model. That function -must have the following arguments: a list of dictionaries that contains the metadata for all the models, and a *score* against which model performance is determined. Exactly what the score represents (e.g. 
the validation loss) is +must have the following arguments: a list of dictionaries that contains the metadata for all the models, and a _score_ against which model performance is determined. Exactly what the score represents (e.g. the validation loss) is domain specific and is provided in the `PBTWorker.pack_data` method described below. -In `tc1_pbt.py`, `truncation_select` implements this *evaluate* function and is passed to the PBTMetaDataStore. In `truncation_select`, if the specified score is in the top 80% of scores, then an empty dictionary is returned. This empty dictionary indicates that a better performing model was not found and thus -*exploit* and *explore* should not occur. If the specified score is in the bottom 20% then the data for a model in the top 20% is random selected -and returned in a python dictionary. The data in this dictionary, the rank of the better performing model and its relevant hyperparameters can then be used in *exploit* and *explore*. +In `tc1_pbt.py`, `truncation_select` implements this _evaluate_ function and is passed to the PBTMetaDataStore. In `truncation_select`, if the specified score is in the top 80% of scores, then an empty dictionary is returned. This empty dictionary indicates that a better performing model was not found and thus +_exploit_ and _explore_ should not occur. If the specified score is in the bottom 20% then the data for a model in the top 20% is random selected +and returned in a python dictionary. The data in this dictionary, the rank of the better performing model and its relevant hyperparameters can then be used in _exploit_ and _explore_. With the PBTMetaDataStore initialized on rank 0, all the remaining processes run the tc1 model. A PBTCallback is added to each one of these models. The PBTCallback constructor requires a instance of a class that implements the PBTWorker interface. A PBTCallback calls the 3 methods of a PBTWorkder to: 1. 
Retrieve a model's metadata and hyperparameters in order put them in the -PBTMetaDataStore (`PBTWorker.pack_data`), -2. Specifies which performance metric to use as the 'score' for model performance (also in `PBTWorker.pack_data`) in an *evaluate*. -3. Determine when a model is *ready* for a potential exploit and explore (`PBTWorker.ready`), -4. Perform the *exploit* and *explore* update (`PBTWorker.update`). + PBTMetaDataStore (`PBTWorker.pack_data`), +2. Specifies which performance metric to use as the 'score' for model performance (also in `PBTWorker.pack_data`) in an _evaluate_. +3. Determine when a model is _ready_ for a potential exploit and explore (`PBTWorker.ready`), +4. Perform the _exploit_ and _explore_ update (`PBTWorker.update`). In the tc1 PBT workflow, `tc1_pbt.TC1PBTWorker` implements the `PBTWorker` -interface. `TC1PBTWorker.pack_data` retrieves a model's current learning rate, and specifies the validation loss as the performance score. `TC1PBTWorker.ready` specifies that the model is *ready* every 5 epochs. (5 is too soon to begin sharing weights, but it serves as an example and does exercise the workflow code within a reasonable amount of time.) `TC1PBTWorker.update` updates the model with a better performing learning rate after having perturbed it. Note that `update` does not need to load the better performing model's weights. That is done automatically in PBTCallback. +interface. `TC1PBTWorker.pack_data` retrieves a model's current learning rate, and specifies the validation loss as the performance score. `TC1PBTWorker.ready` specifies that the model is _ready_ every 5 epochs. (5 is too soon to begin sharing weights, but it serves as an example and does exercise the workflow code within a reasonable amount of time.) `TC1PBTWorker.update` updates the model with a better performing learning rate after having perturbed it. Note that `update` does not need to load the better performing model's weights. That is done automatically in PBTCallback. 
In sum then, in a PBTCallback at the end of every epoch: @@ -233,19 +246,19 @@ In sum then, in a PBTCallback at the end of every epoch: 2. `ready` is called to determine if a model is ready for an exploit / explore update. 3. If `ready` returns true, then the PBTCallback queries the PBTMetaDataStore for a better performing model using the supplied evaluate function (e.g. `truncation_select`). 4. If the selection function returns data from a better performing model, then -`update` is called to update the under performing model with the better performing hyperparameters, and the PBTCallback loads the -better performing model's weights into the under performing model. + `update` is called to update the under performing model with the better performing hyperparameters, and the PBTCallback loads the + better performing model's weights into the under performing model. -## Adapting the Workflow to a Different Model ## +## Adapting the Workflow to a Different Model `tc1_pbt.py` can easily be adapted to work with a different model. The following changes will need to be made: -* A new hyperparameter definition file. The rank 0 -code that reads this file can be re-used. +- A new hyperparameter definition file. The rank 0 + code that reads this file can be re-used. -* A new *evaluate* function. This can be passed to the PBTMetaDataStore -constructor in place of `truncation_select` +- A new _evaluate_ function. This can be passed to the PBTMetaDataStore + constructor in place of `truncation_select` -* A new PBTWorker implementation, implementing `ready`, `pack_data`, and -`update` as appropriate for the new model and workflow. This can be -passed to the PBTCallback in place of `TC1PBTWorker`. +- A new PBTWorker implementation, implementing `ready`, `pack_data`, and + `update` as appropriate for the new model and workflow. This can be + passed to the PBTCallback in place of `TC1PBTWorker`. 
diff --git a/workflows/pbt/data/tc1_params_full.json b/workflows/pbt/data/tc1_params_full.json index a2162270..03582d2a 100644 --- a/workflows/pbt/data/tc1_params_full.json +++ b/workflows/pbt/data/tc1_params_full.json @@ -16,7 +16,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { @@ -54,5 +64,4 @@ "type": "constant", "value": 200 } - ] diff --git a/workflows/pbt/data/tc1_params_small.json b/workflows/pbt/data/tc1_params_small.json index ebea49e5..63d60d05 100644 --- a/workflows/pbt/data/tc1_params_small.json +++ b/workflows/pbt/data/tc1_params_small.json @@ -23,7 +23,17 @@ "name": "activation", "type": "categorical", "element_type": "string", - "values": ["softmax", "elu", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] + "values": [ + "softmax", + "elu", + "softplus", + "softsign", + "relu", + "tanh", + "sigmoid", + "hard_sigmoid", + "linear" + ] }, { @@ -59,5 +69,4 @@ "type": "constant", "value": 5 } - ] diff --git a/workflows/pbt/models/mnist/mnist_cnn.py b/workflows/pbt/models/mnist/mnist_cnn.py index fb79ead0..fed76f2b 100644 --- a/workflows/pbt/models/mnist/mnist_cnn.py +++ b/workflows/pbt/models/mnist/mnist_cnn.py @@ -1,17 +1,16 @@ -'''Trains a simple convnet on the MNIST dataset. +"""Trains a simple convnet on the MNIST dataset. -Gets to 99.25% test accuracy after 12 epochs -(there is still a lot of margin for parameter tuning). -16 seconds per epoch on a GRID K520 GPU. -''' +Gets to 99.25% test accuracy after 12 epochs (there is still a lot of +margin for parameter tuning). 16 seconds per epoch on a GRID K520 GPU. 
+""" from __future__ import print_function + import keras +from keras import backend as K from keras.datasets import fashion_mnist +from keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K batch_size = 128 num_classes = 10 @@ -23,7 +22,7 @@ # the data, split between train and test sets (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data() -if K.image_data_format() == 'channels_first': +if K.image_data_format() == "channels_first": x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) input_shape = (1, img_rows, img_cols) @@ -32,39 +31,43 @@ x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) input_shape = (img_rows, img_cols, 1) -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') +x_train = x_train.astype("float32") +x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') +print("x_train shape:", x_train.shape) +print(x_train.shape[0], "train samples") +print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) y_test = keras.utils.to_categorical(y_test, num_classes) model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) +model.add( + Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation="relu")) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) -model.add(Dense(128, activation='relu')) +model.add(Dense(128, activation="relu")) model.add(Dropout(0.5)) 
-model.add(Dense(num_classes, activation='softmax')) +model.add(Dense(num_classes, activation="softmax")) -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(), - metrics=['accuracy']) +model.compile( + loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(), + metrics=["accuracy"], +) -model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) +model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test), +) score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) +print("Test loss:", score[0]) +print("Test accuracy:", score[1]) diff --git a/workflows/pbt/models/tc1/tc1_baseline_keras2.py b/workflows/pbt/models/tc1/tc1_baseline_keras2.py index 6a993d90..dc95132c 100644 --- a/workflows/pbt/models/tc1/tc1_baseline_keras2.py +++ b/workflows/pbt/models/tc1/tc1_baseline_keras2.py @@ -1,49 +1,62 @@ -import pandas as pd -import numpy as np +import argparse +import gzip import os import sys -import gzip -import argparse + +import numpy as np +import pandas as pd + try: import configparser except ImportError: import ConfigParser as configparser from keras import backend as K - -from keras.layers import Input, Dense, Dropout, Activation, Conv1D, MaxPooling1D, Flatten, LocallyConnected1D +from keras.callbacks import CSVLogger, ModelCheckpoint, ReduceLROnPlateau +from keras.layers import ( + Activation, + Conv1D, + Dense, + Dropout, + Flatten, + Input, + LocallyConnected1D, + MaxPooling1D, +) +from keras.models import Model, Sequential, model_from_json, model_from_yaml from keras.optimizers import SGD, Adam, RMSprop -from keras.models import Sequential, Model, model_from_json, model_from_yaml from keras.utils import np_utils -from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau - from sklearn.metrics 
import accuracy_score -from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler +from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler file_path = os.path.dirname(os.path.realpath(__file__)) -lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) +lib_path = os.path.abspath(os.path.join(file_path, "..", "common")) sys.path.append(lib_path) -lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common')) +lib_path2 = os.path.abspath(os.path.join(file_path, "..", "..", "common")) sys.path.append(lib_path2) import data_utils import p1_common - from solr_keras import CandleRemoteMonitor, TerminateOnTimeOut -#EPOCH = 400 -#BATCH = 20 -#CLASSES = 2 +# EPOCH = 400 +# BATCH = 20 +# CLASSES = 2 + +# PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets +# P = 60483 # 60483 +# DR = 0.1 # Dropout rate -#PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets -#P = 60483 # 60483 -#DR = 0.1 # Dropout rate def common_parser(parser): - parser.add_argument("--config_file", dest='config_file', type=str, - default=os.path.join(file_path, 'tc1_default_model.txt'), - help="specify model configuration file") + parser.add_argument( + "--config_file", + dest="config_file", + type=str, + default=os.path.join(file_path, "tc1_default_model.txt"), + help="specify model configuration file", + ) # Parse has been split between arguments that are common with the default neon parser # and all the other options @@ -52,50 +65,57 @@ def common_parser(parser): return parser + def get_tc1_parser(): - parser = argparse.ArgumentParser(prog='tc1_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Train Autoencoder - Pilot 1 Benchmark 1') + parser = argparse.ArgumentParser( + prog="tc1_baseline", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Train Autoencoder - Pilot 1 Benchmark 1", + ) + + return common_parser(parser) - return common_parser(parser) def 
read_config_file(file): - config=configparser.ConfigParser() + config = configparser.ConfigParser() config.read(file) - section=config.sections() - fileParams={} - - fileParams['data_url']=eval(config.get(section[0],'data_url')) - fileParams['train_data']=eval(config.get(section[0],'train_data')) - fileParams['test_data']=eval(config.get(section[0],'test_data')) - fileParams['model_name']=eval(config.get(section[0],'model_name')) - fileParams['conv']=eval(config.get(section[0],'conv')) - fileParams['dense']=eval(config.get(section[0],'dense')) - fileParams['activation']=eval(config.get(section[0],'activation')) - fileParams['out_act']=eval(config.get(section[0],'out_act')) - fileParams['loss']=eval(config.get(section[0],'loss')) - fileParams['optimizer']=eval(config.get(section[0],'optimizer')) - fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) - fileParams['metrics']=eval(config.get(section[0],'metrics')) - fileParams['epochs']=eval(config.get(section[0],'epochs')) - fileParams['batch_size']=eval(config.get(section[0],'batch_size')) - fileParams['drop']=eval(config.get(section[0],'drop')) - fileParams['classes']=eval(config.get(section[0],'classes')) - fileParams['pool']=eval(config.get(section[0],'pool')) - fileParams['save']=eval(config.get(section[0], 'save')) - fileParams['lr']=eval(config.get(section[0], 'lr')) - fileParams['timeout']=eval(config.get(section[0], 'timeout')) + section = config.sections() + fileParams = {} + + fileParams["data_url"] = eval(config.get(section[0], "data_url")) + fileParams["train_data"] = eval(config.get(section[0], "train_data")) + fileParams["test_data"] = eval(config.get(section[0], "test_data")) + fileParams["model_name"] = eval(config.get(section[0], "model_name")) + fileParams["conv"] = eval(config.get(section[0], "conv")) + fileParams["dense"] = eval(config.get(section[0], "dense")) + fileParams["activation"] = eval(config.get(section[0], "activation")) + fileParams["out_act"] = 
eval(config.get(section[0], "out_act")) + fileParams["loss"] = eval(config.get(section[0], "loss")) + fileParams["optimizer"] = eval(config.get(section[0], "optimizer")) + fileParams["feature_subsample"] = eval( + config.get(section[0], "feature_subsample")) + fileParams["metrics"] = eval(config.get(section[0], "metrics")) + fileParams["epochs"] = eval(config.get(section[0], "epochs")) + fileParams["batch_size"] = eval(config.get(section[0], "batch_size")) + fileParams["drop"] = eval(config.get(section[0], "drop")) + fileParams["classes"] = eval(config.get(section[0], "classes")) + fileParams["pool"] = eval(config.get(section[0], "pool")) + fileParams["save"] = eval(config.get(section[0], "save")) + fileParams["lr"] = eval(config.get(section[0], "lr")) + fileParams["timeout"] = eval(config.get(section[0], "timeout")) return fileParams + def initialize_parameters(): # Get command-line parameters parser = get_tc1_parser() args = parser.parse_args() - #print('Args:', args) + # print('Args:', args) # Get parameters from configuration file fileParameters = read_config_file(args.config_file) - #print ('Params:', fileParameters) + # print ('Params:', fileParameters) # Consolidate parameter set. 
Command-line parameters overwrite file configuration gParameters = p1_common.args_overwrite_config(args, fileParameters) return gParameters @@ -103,31 +123,33 @@ def initialize_parameters(): def load_data(train_path, test_path, gParameters): - print('Loading data...') - if gParameters['feature_subsample'] > 0: - usecols = list(range(gParameters['feature_subsample'])) + print("Loading data...") + if gParameters["feature_subsample"] > 0: + usecols = list(range(gParameters["feature_subsample"])) else: usecols = None - df_train = (pd.read_csv(train_path, header=None, usecols=usecols).values).astype('float32') - df_test = (pd.read_csv(test_path, header=None, usecols=usecols).values).astype('float32') - print('done') + df_train = (pd.read_csv(train_path, header=None, + usecols=usecols).values).astype("float32") + df_test = (pd.read_csv(test_path, header=None, + usecols=usecols).values).astype("float32") + print("done") - print('df_train shape:', df_train.shape) - print('df_test shape:', df_test.shape) + print("df_train shape:", df_train.shape) + print("df_test shape:", df_test.shape) seqlen = df_train.shape[1] - df_y_train = df_train[:,0].astype('int') - df_y_test = df_test[:,0].astype('int') + df_y_train = df_train[:, 0].astype("int") + df_y_test = df_test[:, 0].astype("int") - Y_train = np_utils.to_categorical(df_y_train,gParameters['classes']) - Y_test = np_utils.to_categorical(df_y_test,gParameters['classes']) + Y_train = np_utils.to_categorical(df_y_train, gParameters["classes"]) + Y_test = np_utils.to_categorical(df_y_test, gParameters["classes"]) df_x_train = df_train[:, 1:seqlen].astype(np.float32) df_x_test = df_test[:, 1:seqlen].astype(np.float32) -# X_train = df_x_train.as_matrix() -# X_test = df_x_test.as_matrix() + # X_train = df_x_train.as_matrix() + # X_test = df_x_test.as_matrix() X_train = df_x_train X_test = df_x_test @@ -144,22 +166,27 @@ def load_data(train_path, test_path, gParameters): def run(gParameters, callbacks): - print ('Params:', 
gParameters) + print("Params:", gParameters) - file_train = gParameters['train_data'] - file_test = gParameters['test_data'] - url = gParameters['data_url'] + file_train = gParameters["train_data"] + file_test = gParameters["test_data"] + url = gParameters["data_url"] - train_file = data_utils.get_file(file_train, url+file_train, cache_subdir='Pilot1') - test_file = data_utils.get_file(file_test, url+file_test, cache_subdir='Pilot1') + train_file = data_utils.get_file(file_train, + url + file_train, + cache_subdir="Pilot1") + test_file = data_utils.get_file(file_test, + url + file_test, + cache_subdir="Pilot1") - X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters) + X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, + gParameters) - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) - print('Y_train shape:', Y_train.shape) - print('Y_test shape:', Y_test.shape) + print("Y_train shape:", Y_train.shape) + print("Y_test shape:", Y_test.shape) x_train_len = X_train.shape[1] @@ -168,116 +195,139 @@ def run(gParameters, callbacks): X_train = np.expand_dims(X_train, axis=2) X_test = np.expand_dims(X_test, axis=2) - print('X_train shape:', X_train.shape) - print('X_test shape:', X_test.shape) + print("X_train shape:", X_train.shape) + print("X_test shape:", X_test.shape) model = Sequential() dense_first = True - layer_list = list(range(0, len(gParameters['conv']), 3)) + layer_list = list(range(0, len(gParameters["conv"]), 3)) for l, i in enumerate(layer_list): - filters = gParameters['conv'][i] - filter_len = gParameters['conv'][i+1] - stride = gParameters['conv'][i+2] - print(i/3, filters, filter_len, stride) - if gParameters['pool']: - pool_list=gParameters['pool'] + filters = gParameters["conv"][i] + filter_len = gParameters["conv"][i + 1] + stride = gParameters["conv"][i + 2] + print(i / 3, filters, filter_len, stride) 
+ if gParameters["pool"]: + pool_list = gParameters["pool"] if type(pool_list) != list: - pool_list=list(pool_list) + pool_list = list(pool_list) if filters <= 0 or filter_len <= 0 or stride <= 0: - break + break dense_first = False - if 'locally_connected' in gParameters: - model.add(LocallyConnected1D(filters, filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) + if "locally_connected" in gParameters: + model.add( + LocallyConnected1D( + filters, + filter_len, + strides=stride, + padding="valid", + input_shape=(x_train_len, 1), + )) else: - #input layer + # input layer if i == 0: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid', input_shape=(x_train_len, 1))) + model.add( + Conv1D( + filters=filters, + kernel_size=filter_len, + strides=stride, + padding="valid", + input_shape=(x_train_len, 1), + )) else: - model.add(Conv1D(filters=filters, kernel_size=filter_len, strides=stride, padding='valid')) - model.add(Activation(gParameters['activation'])) - if gParameters['pool']: - model.add(MaxPooling1D(pool_size=pool_list[i//3])) + model.add( + Conv1D( + filters=filters, + kernel_size=filter_len, + strides=stride, + padding="valid", + )) + model.add(Activation(gParameters["activation"])) + if gParameters["pool"]: + model.add(MaxPooling1D(pool_size=pool_list[i // 3])) if not dense_first: model.add(Flatten()) - for i, layer in enumerate(gParameters['dense']): + for i, layer in enumerate(gParameters["dense"]): if layer: if i == 0 and dense_first: model.add(Dense(layer, input_shape=(x_train_len, 1))) else: model.add(Dense(layer)) - model.add(Activation(gParameters['activation'])) - if gParameters['drop']: - model.add(Dropout(gParameters['drop'])) + model.add(Activation(gParameters["activation"])) + if gParameters["drop"]: + model.add(Dropout(gParameters["drop"])) if dense_first: model.add(Flatten()) - model.add(Dense(gParameters['classes'])) - - model.add(Activation(gParameters['out_act'])) - -#Reference 
case -#model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=1)) -#model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) -#model.add(Activation('relu')) -#model.add(MaxPooling1D(pool_size=10)) -#model.add(Flatten()) -#model.add(Dense(200)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(20)) -#model.add(Activation('relu')) -#model.add(Dropout(0.1)) -#model.add(Dense(CLASSES)) -#model.add(Activation('softmax')) + model.add(Dense(gParameters["classes"])) + + model.add(Activation(gParameters["out_act"])) + + # Reference case + # model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1))) + # model.add(Activation('relu')) + # model.add(MaxPooling1D(pool_size=1)) + # model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid')) + # model.add(Activation('relu')) + # model.add(MaxPooling1D(pool_size=10)) + # model.add(Flatten()) + # model.add(Dense(200)) + # model.add(Activation('relu')) + # model.add(Dropout(0.1)) + # model.add(Dense(20)) + # model.add(Activation('relu')) + # model.add(Dropout(0.1)) + # model.add(Dense(CLASSES)) + # model.add(Activation('softmax')) model.summary() # ["adam", "rmsprop"] - lr = gParameters['lr'] - if gParameters['optimizer'] == 'adam': + lr = gParameters["lr"] + if gParameters["optimizer"] == "adam": optimizer = Adam(lr=lr) - elif gParameters['optimizer'] == 'rmsprop': + elif gParameters["optimizer"] == "rmsprop": optimizer = RMSprop(lr=lr) - model.compile(loss=gParameters['loss'], - optimizer=optimizer, - metrics=[gParameters['metrics']]) + model.compile(loss=gParameters["loss"], + optimizer=optimizer, + metrics=[gParameters["metrics"]]) - output_dir = gParameters['save'] + output_dir = gParameters["save"] if not os.path.exists(output_dir): os.makedirs(output_dir) -# set up a bunch of callbacks to do work during model training.. 
+ # set up a bunch of callbacks to do work during model training.. - model_name = gParameters['model_name'] - path = '{}/{}.autosave.model.h5'.format(output_dir, model_name) + model_name = gParameters["model_name"] + path = "{}/{}.autosave.model.h5".format(output_dir, model_name) # checkpointer = ModelCheckpoint(filepath=path, verbose=1, save_weights_only=False, save_best_only=True) # csv_logger = CSVLogger('{}/training.log'.format(output_dir)) # reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0) candleRemoteMonitor = CandleRemoteMonitor(params=gParameters) - #callbacks.append(reduce_lr) - #timeout = 3600 - #timeoutMonitor = TerminateOnTimeOut(timeout) + # callbacks.append(reduce_lr) + # timeout = 3600 + # timeoutMonitor = TerminateOnTimeOut(timeout) callbacks.append(candleRemoteMonitor) - #callbacks.append(timeoutMonitor) - history = model.fit(X_train, Y_train, - batch_size=gParameters['batch_size'], - epochs=gParameters['epochs'], - verbose=0, - validation_data=(X_test, Y_test), - callbacks = callbacks) + # callbacks.append(timeoutMonitor) + history = model.fit( + X_train, + Y_train, + batch_size=gParameters["batch_size"], + epochs=gParameters["epochs"], + verbose=0, + validation_data=(X_test, Y_test), + callbacks=callbacks, + ) score = model.evaluate(X_test, Y_test, verbose=0) - print('Test score:', score[0]) - print('Test accuracy:', score[1]) + print("Test score:", score[0]) + print("Test accuracy:", score[1]) # serialize model to JSON # model_json = model.to_json() @@ -342,14 +392,16 @@ def run(gParameters, callbacks): return history + def main(): gParameters = initialize_parameters() run(gParameters) -if __name__ == '__main__': + +if __name__ == "__main__": main() try: K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass diff --git a/workflows/pbt/models/tc1/tc1_runner.py 
b/workflows/pbt/models/tc1/tc1_runner.py index 3a0b24b0..1e072fd0 100644 --- a/workflows/pbt/models/tc1/tc1_runner.py +++ b/workflows/pbt/models/tc1/tc1_runner.py @@ -1,30 +1,41 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. import sys -if not hasattr(sys, 'argv'): - sys.argv = ['nt3_tc1'] +if not hasattr(sys, "argv"): + sys.argv = ["nt3_tc1"] + +import importlib import json import os + +import log_tools import numpy as np -import importlib import runner_utils -import log_tools logger = None + def import_pkg(framework, model_name): - if framework == 'keras': + if framework == "keras": module_name = "{}_baseline_keras2".format(model_name) pkg = importlib.import_module(module_name) from keras import backend as K - if K.backend() == 'tensorflow' and 'NUM_INTER_THREADS' in os.environ: + + if K.backend() == "tensorflow" and "NUM_INTER_THREADS" in os.environ: import tensorflow as tf - print("Configuring tensorflow with {} inter threads and {} intra threads". 
- format(os.environ['NUM_INTER_THREADS'], os.environ['NUM_INTRA_THREADS'])) - session_conf = tf.ConfigProto(inter_op_parallelism_threads=int(os.environ['NUM_INTER_THREADS']), - intra_op_parallelism_threads=int(os.environ['NUM_INTRA_THREADS'])) + + print( + "Configuring tensorflow with {} inter threads and {} intra threads" + .format(os.environ["NUM_INTER_THREADS"], + os.environ["NUM_INTRA_THREADS"])) + session_conf = tf.ConfigProto( + inter_op_parallelism_threads=int( + os.environ["NUM_INTER_THREADS"]), + intra_op_parallelism_threads=int( + os.environ["NUM_INTRA_THREADS"]), + ) sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) K.set_session(sess) # elif framework is 'mxnet': @@ -37,21 +48,22 @@ def import_pkg(framework, model_name): raise ValueError("Invalid framework: {}".format(framework)) return pkg + def run(hyper_parameter_map, callbacks): global logger logger = log_tools.get_logger(logger, __name__) - framework = hyper_parameter_map['framework'] - model_name = hyper_parameter_map['model_name'] + framework = hyper_parameter_map["framework"] + model_name = hyper_parameter_map["model_name"] pkg = import_pkg(framework, model_name) runner_utils.format_params(hyper_parameter_map) # params is python dictionary params = pkg.initialize_parameters() - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v @@ -61,12 +73,13 @@ def run(hyper_parameter_map, callbacks): runner_utils.keras_clear_session(framework) # use the last validation_loss as the value to minimize - val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] result = val_loss[-1] print("result: ", result) return result -if __name__ == '__main__': + +if __name__ == "__main__": logger = log_tools.get_logger(logger, __name__) logger.debug("RUN START") @@ -77,13 +90,14 @@ def run(hyper_parameter_map, 
callbacks): exp_id = sys.argv[5] run_id = sys.argv[6] benchmark_timeout = int(sys.argv[7]) - hyper_parameter_map = runner_utils.init(param_string, instance_directory, framework, 'save') - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = exp_id - hyper_parameter_map['run_id'] = run_id - hyper_parameter_map['timeout'] = benchmark_timeout + hyper_parameter_map = runner_utils.init(param_string, instance_directory, + framework, "save") + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = exp_id + hyper_parameter_map["run_id"] = run_id + hyper_parameter_map["timeout"] = benchmark_timeout # clear sys.argv so that argparse doesn't object - sys.argv = ['nt3_tc1_runner'] + sys.argv = ["nt3_tc1_runner"] result = run(hyper_parameter_map) runner_utils.write_output(result, instance_directory) logger.debug("RUN STOP") diff --git a/workflows/pbt/python/file_test.py b/workflows/pbt/python/file_test.py index d1a8cf09..441cfe87 100644 --- a/workflows/pbt/python/file_test.py +++ b/workflows/pbt/python/file_test.py @@ -1,22 +1,25 @@ -import time, random, sys -from mpi4py import MPI -from pbt_utils import PBTMetaDataStore, PBTClient, Timer +import random +import sys +import time import keras from keras import backend as K +from mpi4py import MPI +from pbt_utils import PBTClient, PBTMetaDataStore, Timer GET = 0 PUT = 1 + def r2(y_true, y_pred): - SS_res = K.sum(K.square(y_true - y_pred)) + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res/(SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) def run(comm, worker_comm, model_file): client = PBTClient(comm, 0) - model = keras.models.load_model(model_file, custom_objects={'r2' : r2}) + model = keras.models.load_model(model_file, custom_objects={"r2": r2}) timer = Timer("./timings_{}.csv".format(client.rank)) timer.start() @@ -48,6 +51,7 @@ def run(comm, worker_comm, model_file): 
timer.close() client.done() + def main(model_file): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -60,5 +64,6 @@ def main(model_file): else: run(comm, worker_comm, model_file) -if __name__ == '__main__': + +if __name__ == "__main__": main(sys.argv[1]) diff --git a/workflows/pbt/python/pbt.py b/workflows/pbt/python/pbt.py index 1b0548b8..85116e45 100644 --- a/workflows/pbt/python/pbt.py +++ b/workflows/pbt/python/pbt.py @@ -1,11 +1,13 @@ -from mpi4py import MPI -import time, math, ctypes - +import ctypes +import math +import os.path +import random +import time from collections import deque -import random, os.path import keras import pbt_utils +from mpi4py import MPI try: import cPickle as pkl @@ -18,15 +20,13 @@ from io import BytesIO as IO - class Timer: def __init__(self, fname=None): if fname == None: self.out = None else: - self.out = open(fname, 'w') - + self.out = open(fname, "w") def start(self): self.t = time.time() @@ -45,7 +45,18 @@ def close(self): class MsgType: - LOCKED, UNLOCKED, ACQUIRE_READ_LOCK, RELEASE_READ_LOCK, ACQUIRE_WRITE_LOCK, RELEASE_WRITE_LOCK, GET_DATA, PUT_DATA, LOG, DONE = range(10) + ( + LOCKED, + UNLOCKED, + ACQUIRE_READ_LOCK, + RELEASE_READ_LOCK, + ACQUIRE_WRITE_LOCK, + RELEASE_WRITE_LOCK, + GET_DATA, + PUT_DATA, + LOG, + DONE, + ) = range(10) class Tags: @@ -53,13 +64,12 @@ class Tags: class PBTClient: - """Client of the PBTMetaDataStore, used to request locks, and put and get data - from a PBTMetaDataStore. - """ + """Client of the PBTMetaDataStore, used to request locks, and put and get + data from a PBTMetaDataStore.""" def __init__(self, comm, dest, outdir): - """Initializes the PBT client with a communicator and the destination rank - of the PBTMetaDataStore + """Initializes the PBT client with a communicator and the destination + rank of the PBTMetaDataStore. 
:param comm: the communicator to use to send / recv messages to the PBTMetaDataStore :param dest: the rank of the PBTMetaDataStore @@ -78,13 +88,13 @@ def acquire_read_lock(self, for_rank): :param for_rank: the rank of the weights file to acquire the lock for. """ - msg = {'type' : MsgType.ACQUIRE_READ_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.ACQUIRE_READ_LOCK, "rank": for_rank} status = MPI.Status() - #print("{} requesting read lock: {}".format(self.rank, msg)) + # print("{} requesting read lock: {}".format(self.rank, msg)) self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print("{} acquired read lock".format(self.rank)) + # print("{} acquired read lock".format(self.rank)) def release_read_lock(self, for_rank): """Releases a previously acquired read lock for the weights file @@ -92,27 +102,28 @@ def release_read_lock(self, for_rank): :param for_rank: the rank of the weights file to release the lock for. """ - msg = {'type' : MsgType.RELEASE_READ_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.RELEASE_READ_LOCK, "rank": for_rank} status = MPI.Status() self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock release self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) def release_write_lock(self, for_rank): - """Releases the write lock for the specified rank that has been acquired - by the put_data call. + """Releases the write lock for the specified rank that has been + acquired by the put_data call. :param for_rank: the rank of the weights file to release the write lock for. 
""" - msg = {'type' : MsgType.RELEASE_WRITE_LOCK, 'rank' : for_rank} + msg = {"type": MsgType.RELEASE_WRITE_LOCK, "rank": for_rank} status = MPI.Status() self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # wait for acknowledgement of lock release self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) def get_data(self, score, lock_weights=True): - """Gets the metadata for a better performing model, assuming there is one. + """Gets the metadata for a better performing model, assuming there is + one. Given a score against which to evaluate model performance, this will return the metadata for a better performing model as dictionary. If there is no better @@ -132,13 +143,17 @@ def get_data(self, score, lock_weights=True): a model's performance ('score'). The dictionary will also contain whatever model hyperparameters client code puts in the datastore. """ - msg = {'type' : MsgType.GET_DATA, 'lock_weights' : lock_weights, 'score': score} + msg = { + "type": MsgType.GET_DATA, + "lock_weights": lock_weights, + "score": score + } self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) status = MPI.Status() result = self.comm.recv(source=self.dest, tag=Tags.SCORE, status=status) if len(result) and lock_weights: - self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print{"{} acquired weights lock".format(self.rank)) + self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) + # print{"{} acquired weights lock".format(self.rank)) return result def put_data(self, data, lock_weights=True): @@ -152,43 +167,46 @@ def put_data(self, data, lock_weights=True): :param lock_weights: if True this method will also acquire the write lock for the weights file associated with the rank in the data dictionary. 
""" - msg = {'type' : MsgType.PUT_DATA, 'data' : data, - 'lock_weights' : lock_weights} + msg = { + "type": MsgType.PUT_DATA, + "data": data, + "lock_weights": lock_weights + } self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) # don't return until the score has actually been put self.comm.recv(source=self.dest, tag=Tags.ACK) status = MPI.Status() if lock_weights: self.comm.recv(source=self.dest, tag=Tags.ACK, status=status) - #print{"{} acquired weights lock".format(self.rank)) + # print{"{} acquired weights lock".format(self.rank)) def log(self, log): - """Logs the specified log message. - """ - msg = {'type': MsgType.LOG, 'log': log} + """Logs the specified log message.""" + msg = {"type": MsgType.LOG, "log": log} self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) def done(self): - """Notifies the PBTMetaDataStore that model associated with this PBTClient - is finished. + """Notifies the PBTMetaDataStore that model associated with this + PBTClient is finished. - No more PBTClient calls should be made after this method is called. + No more PBTClient calls should be made after this method is + called. 
""" - msg = {'type' : MsgType.DONE} + msg = {"type": MsgType.DONE} self.comm.send(msg, dest=self.dest, tag=Tags.REQUEST) - def put(self, data, model): self.put_data(data) - #model.save_weights("{}/weights_{}.h5".format(self.outdir, self.rank)) + # model.save_weights("{}/weights_{}.h5".format(self.outdir, self.rank)) pbt_utils.save_state(model, self.outdir, self.rank) self.release_write_lock(self.rank) def load_state(self, model, data, read_rank): pbt_utils.load_state(model, self.outdir, read_rank) - #model.load_weights("{}/weights_{}.h5".format(self.outdir, read_rank)) + # model.load_weights("{}/weights_{}.h5".format(self.outdir, read_rank)) self.release_read_lock(read_rank) + class DataSpacesPBTClient(PBTClient): def __init__(self, comm, dest, outdir): @@ -220,15 +238,17 @@ def make_comm_arg(self, comm): def put(self, data, model): weights = pkl.dumps(model.get_weights(), pkl.HIGHEST_PROTOCOL) weights_size = len(weights) - data['_weights_size_'] = weights_size + data["_weights_size_"] = weights_size self.put_data(data) - self.lib.pbt_ds_put_weights(self.rank, weights, weights_size, self.mpi_comm_self) + self.lib.pbt_ds_put_weights(self.rank, weights, weights_size, + self.mpi_comm_self) self.release_write_lock(self.rank) def load_weights(self, model, data, read_rank): - weights_size = data['_weights_size_'] + weights_size = data["_weights_size_"] str_weights = ctypes.create_string_buffer(weights_size) - self.lib.pbt_ds_get_weights(read_rank, str_weights, weights_size, self.mpi_comm_self) + self.lib.pbt_ds_get_weights(read_rank, str_weights, weights_size, + self.mpi_comm_self) model.set_weights(pkl.load(IO(str_weights))) self.release_read_lock(read_rank) @@ -239,8 +259,7 @@ def done(self): class DataStoreLock: - """Lock for an individual weights file. 
- """ + """Lock for an individual weights file.""" def __init__(self, comm, source, target): """ @@ -254,12 +273,12 @@ def __init__(self, comm, source, target): self.comm = comm def lock(self): - #print{"Ack for lock '{}' lock from {}".format(self.locked_obj, self.target)) + # print{"Ack for lock '{}' lock from {}".format(self.locked_obj, self.target)) # send the acknowledgement of the lock back to target self.comm.send(MsgType.LOCKED, dest=self.target, tag=Tags.ACK) def unlock(self): - #print{"Ack for unlock '{}' lock from {}".format(self.locked_obj, self.target)) + # print{"Ack for unlock '{}' lock from {}".format(self.locked_obj, self.target)) self.comm.send(MsgType.UNLOCKED, dest=self.target, tag=Tags.ACK) @@ -336,7 +355,7 @@ def __init__(self, comm, outdir, exploiter, log_file, dataspaces=False): for i in range(self.comm.Get_size()): if i != self.rank: self.locks[i] = DataStoreLockManager(self.comm, self.rank) - self.scores[i] = {'score': float('nan')} + self.scores[i] = {"score": float("nan")} self.log_file = log_file self.all_scores = [] self.logs = [] @@ -353,11 +372,11 @@ def write_data(self): f = "{}/output.csv".format(self.outdir) header = self.all_scores[0].keys() if not os.path.isfile(f): - with open(f, 'w') as f_out: + with open(f, "w") as f_out: f_out.write(",".join(header)) f_out.write("\n") - with open(f, 'a') as f_out: + with open(f, "a") as f_out: for item in self.all_scores: for i, h in enumerate(header): if i > 0: @@ -372,7 +391,7 @@ def done(self): self.write_data() def write_logs(self): - with open(self.log_file, 'a') as f_out: + with open(self.log_file, "a") as f_out: for l in self.logs: f_out.write(l) f_out.write("\n") @@ -380,31 +399,31 @@ def write_logs(self): self.logs = [] def acquire_read_lock(self, requesting_rank, key): - #print("{} acquiring read lock for {}".format(requesting_rank, key)) + # print("{} acquiring read lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.read_lock(requesting_rank) def 
release_read_lock(self, requesting_rank, key): - #print("{} releasing read lock for {}".format(requesting_rank, key)) + # print("{} releasing read lock for {}".format(requesting_rank, key)) # can get NULL_RANK if score requested but no scores yet lock_manager = self.locks[key] lock_manager.read_unlock(requesting_rank) def acquire_write_lock(self, requesting_rank, key): - #print("{} acquiring write lock for {}".format(requesting_rank, key)) + # print("{} acquiring write lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.write_lock(requesting_rank) def release_write_lock(self, requesting_rank, key): - #print("{} releasing write lock for {}".format(requesting_rank, key)) + # print("{} releasing write lock for {}".format(requesting_rank, key)) lock_manager = self.locks[key] lock_manager.write_unlock(requesting_rank) def put_data(self, putting_rank, data): """ - :param :data - dictionary of data: val_loss etc. + :param :data - dictionary of data: val_loss etc. 
""" - #print("Putting score {},{}".format(putting_rank, data)) + # print("Putting score {},{}".format(putting_rank, data)) self.all_scores.append(data) live_ranks = self.comm.Get_size() - 1 @@ -414,54 +433,57 @@ def put_data(self, putting_rank, data): self.comm.send(MsgType.PUT_DATA, tag=Tags.ACK, dest=putting_rank) def get_data(self, score): - items = [x for x in self.scores.values() if not math.isnan(x['score'])] + items = [x for x in self.scores.values() if not math.isnan(x["score"])] result = self.exploiter(items, score) return result def run(self): t = time.localtime() start_time = time.time() - self.logs.append("PBT Start: {}".format(time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.logs.append("PBT Start: {}".format( + time.strftime("%Y-%m-%d %H:%M:%S", t))) self.write_logs() - + status = MPI.Status() live_ranks = self.comm.Get_size() - 1 while live_ranks > 0: - msg = self.comm.recv(source=MPI.ANY_SOURCE, tag=Tags.REQUEST, status=status) + msg = self.comm.recv(source=MPI.ANY_SOURCE, + tag=Tags.REQUEST, + status=status) source = status.Get_source() - msg_type = msg['type'] + msg_type = msg["type"] if msg_type == MsgType.ACQUIRE_READ_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.acquire_read_lock(source, msg_rank) elif msg_type == MsgType.RELEASE_READ_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.release_read_lock(source, msg_rank) elif msg_type == MsgType.RELEASE_WRITE_LOCK: - msg_rank = msg['rank'] + msg_rank = msg["rank"] self.release_write_lock(source, msg_rank) elif msg_type == MsgType.PUT_DATA: - data = msg['data'] - lock_weights = msg['lock_weights'] + data = msg["data"] + lock_weights = msg["lock_weights"] self.put_data(source, data) if lock_weights: self.acquire_write_lock(source, source) elif msg_type == MsgType.GET_DATA: - score = msg['score'] + score = msg["score"] result = self.get_data(score) self.comm.send(result, dest=source, tag=Tags.SCORE) - lock_weights = msg['lock_weights'] + lock_weights = msg["lock_weights"] if 
len(result) and lock_weights: - rank_to_read = result['rank'] + rank_to_read = result["rank"] self.acquire_read_lock(source, rank_to_read) elif msg_type == MsgType.LOG: - log = msg['log'] + log = msg["log"] self.logs.append(log) if len(self.logs) > 20: self.write_logs() @@ -469,24 +491,23 @@ def run(self): elif msg_type == MsgType.DONE: live_ranks -= 1 - t = time.localtime() - self.logs.append("PBT End: {}".format(time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.logs.append("PBT End: {}".format( + time.strftime("%Y-%m-%d %H:%M:%S", t))) self.logs.append("Duration: {}".format(time.time() - start_time)) self.done() print("Done") - + class PBTWorker: - """ PBTCallback uses classes that implement this API to determine - when a model is ready to exploit and explore, to retrieve metadata - and hyperparameters from the model to put in the shared PBTMetaDataStore, - and to perform the model specific exploit and explore update. - """ + """PBTCallback uses classes that implement this API to determine when a + model is ready to exploit and explore, to retrieve metadata and + hyperparameters from the model to put in the shared PBTMetaDataStore, and + to perform the model specific exploit and explore update.""" def ready(self, pbt_client, epoch, model): - """ Returns True if the model is ready for an exploit explore update. + """Returns True if the model is ready for an exploit explore update. :param pbt_client: A PBTClient instance that can be used for logging (i.e. pbt_client.log(msg)) @@ -496,10 +517,10 @@ def ready(self, pbt_client, epoch, model): pass def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the PBTMetaDataStore. A typical implementation will select - one of the metrics (e.g. 'val_loss') from the keras provided metrics - and set that as the 'score' used to determine model peformance. 
+ """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the PBTMetaDataStore. A typical implementation will + select one of the metrics (e.g. 'val_loss') from the keras provided + metrics and set that as the 'score' used to determine model peformance. Any hyperparameters that are updated in an exploit / explore should also be included in the returned dictionary. For example, @@ -521,9 +542,9 @@ def pack_data(self, pbt_client, model, metrics): pass def update(self, epoch, pbt_client, model, data): - """ Updates the specified model by performing an exploit / explore - using the data in data. NOTE that the PBTCallback will load the - new weights into the model. That should NOT be done here. + """Updates the specified model by performing an exploit / explore using + the data in data. NOTE that the PBTCallback will load the new weights + into the model. That should NOT be done here. For example, assuming the pack_data method stores the learing rate as 'lr' and we want to update the specified model's lr to a perturbed @@ -547,6 +568,7 @@ def update(self, epoch, pbt_client, model, data): """ pass + import traceback @@ -570,7 +592,7 @@ class PBTCallback(keras.callbacks.Callback): PUT = 1 def __init__(self, comm, root_rank, outdir, pbt_worker, dataspaces=False): - """ Initializes this PBTCallback. + """Initializes this PBTCallback. 
:param comm: the MPI communicator in which this PBTCallback operates :param root_rank: the rank of the PBTMetaDataStore @@ -579,49 +601,55 @@ def __init__(self, comm, root_rank, outdir, pbt_worker, dataspaces=False): """ if dataspaces: raise ValueError("Dataspaces is not currently supported") - #self.client = DataSpacesPBTClient(comm, root_rank, outdir) + # self.client = DataSpacesPBTClient(comm, root_rank, outdir) else: self.client = PBTClient(comm, root_rank, outdir) self.outdir = outdir - #self.timer = Timer("{}/timings_{}.csv".format(self.outdir, self.client.rank)) + # self.timer = Timer("{}/timings_{}.csv".format(self.outdir, self.client.rank)) self.pbt_worker = pbt_worker def on_batch_end(self, batch, logs): pass def on_epoch_begin(self, epoch, logs): - + t = time.localtime() - self.client.log("Client {} Epoch {} Start: {}".format(self.client.rank, epoch, time.strftime('%Y-%m-%d %H:%M:%S', t))) - + self.client.log("Client {} Epoch {} Start: {}".format( + self.client.rank, epoch, time.strftime("%Y-%m-%d %H:%M:%S", t))) + self.epoch_start = time.time() def on_epoch_end(self, epoch, logs): - metrics = {'epoch': epoch, 'rank': self.client.rank, 'duration' : time.time() - self.epoch_start} - #print("Rank: {}, Epoch: {} end".format(self.client.rank, epoch)) + metrics = { + "epoch": epoch, + "rank": self.client.rank, + "duration": time.time() - self.epoch_start, + } + # print("Rank: {}, Epoch: {} end".format(self.client.rank, epoch)) metrics.update(logs) data = self.pbt_worker.pack_data(self.client, self.model, metrics) self.client.put(data, self.model) - #self.timer.end(PBTCallback.PUT) + # self.timer.end(PBTCallback.PUT) if self.pbt_worker.ready(self.client, self.model, epoch): - result = self.client.get_data(data['score']) + result = self.client.get_data(data["score"]) if len(result): - print("{},{} is ready - updating".format(epoch, self.client.rank)) - rank_to_read = result['rank'] + print("{},{} is ready - updating".format( + epoch, self.client.rank)) + 
rank_to_read = result["rank"] self.client.load_state(self.model, result, rank_to_read) # update after loading state as loading the state will set the state # of the optimizer etc. self.pbt_worker.update(epoch, self.client, self.model, result) print("{},{} updated".format(epoch, self.client.rank)) - #print("{} loading weights from {}".format(self.client.rank, rank)) - - #else: - # print("{},{} is ready - no update".format(epoch, self.client.rank)) - + # print("{} loading weights from {}".format(self.client.rank, rank)) + + # else: + # print("{},{} is ready - no update".format(epoch, self.client.rank)) def on_train_end(self, logs={}): t = time.localtime() - self.client.log("Client {} End: {}".format(self.client.rank, time.strftime('%Y-%m-%d %H:%M:%S', t))) + self.client.log("Client {} End: {}".format( + self.client.rank, time.strftime("%Y-%m-%d %H:%M:%S", t))) self.client.done() diff --git a/workflows/pbt/python/pbt_utils.py b/workflows/pbt/python/pbt_utils.py index be9965ab..9b4e703d 100644 --- a/workflows/pbt/python/pbt_utils.py +++ b/workflows/pbt/python/pbt_utils.py @@ -1,10 +1,12 @@ -from keras.engine import topology +import json + +import h5py +import keras.backend as K +import numpy as np from keras import optimizers +from keras.engine import topology from keras.models import Sequential -import keras.backend as K -import h5py, json -import numpy as np def get_json_type(obj): """Serialize any object to a JSON-serializable structure. @@ -20,9 +22,11 @@ def get_json_type(obj): """ # if obj is a serializable Keras class instance # e.g. 
optimizer, layer - if hasattr(obj, 'get_config'): - return {'class_name': obj.__class__.__name__, - 'config': obj.get_config()} + if hasattr(obj, "get_config"): + return { + "class_name": obj.__class__.__name__, + "config": obj.get_config() + } # if obj is any numpy type if type(obj).__module__ == np.__name__: @@ -36,142 +40,148 @@ def get_json_type(obj): if type(obj).__name__ == type.__name__: return obj.__name__ - raise TypeError('Not JSON Serializable:', obj) + raise TypeError("Not JSON Serializable:", obj) + def convert_custom_objects(obj, custom_objects={}): - """Handles custom object lookup. - - # Arguments - obj: object, dict, or list. - - # Returns - The same structure, where occurences - of a custom object name have been replaced - with the custom object. - """ - if isinstance(obj, list): - deserialized = [] - for value in obj: - if value in custom_objects: - deserialized.append(custom_objects[value]) - else: - deserialized.append(value) - return deserialized - if isinstance(obj, dict): - deserialized = {} - for key, value in obj.items(): - if value in custom_objects: - deserialized[key] = custom_objects[value] - else: - deserialized[key] = value - return deserialized - if obj in custom_objects: - return custom_objects[obj] - return obj + """Handles custom object lookup. + + # Arguments + obj: object, dict, or list. + + # Returns + The same structure, where occurences + of a custom object name have been replaced + with the custom object. 
+ """ + if isinstance(obj, list): + deserialized = [] + for value in obj: + if value in custom_objects: + deserialized.append(custom_objects[value]) + else: + deserialized.append(value) + return deserialized + if isinstance(obj, dict): + deserialized = {} + for key, value in obj.items(): + if value in custom_objects: + deserialized[key] = custom_objects[value] + else: + deserialized[key] = value + return deserialized + if obj in custom_objects: + return custom_objects[obj] + return obj + def save_optimizer(model, hdf_file): # from save_model in keras.models.py - hdf_file.attrs['training_config'] = json.dumps({ - 'optimizer_config': { - 'class_name': model.optimizer.__class__.__name__, - 'config': model.optimizer.get_config() - }, - 'loss': model.loss, - 'metrics': model.metrics, - 'sample_weight_mode': model.sample_weight_mode, - 'loss_weights': model.loss_weights, - }, default=get_json_type).encode('utf8') + hdf_file.attrs["training_config"] = json.dumps( + { + "optimizer_config": { + "class_name": model.optimizer.__class__.__name__, + "config": model.optimizer.get_config(), + }, + "loss": model.loss, + "metrics": model.metrics, + "sample_weight_mode": model.sample_weight_mode, + "loss_weights": model.loss_weights, + }, + default=get_json_type, + ).encode("utf8") # Save optimizer weights. 
- symbolic_weights = getattr(model.optimizer, 'weights') + symbolic_weights = getattr(model.optimizer, "weights") if symbolic_weights: - optimizer_weights_group = hdf_file.create_group('optimizer_weights') + optimizer_weights_group = hdf_file.create_group("optimizer_weights") weight_values = K.batch_get_value(symbolic_weights) weight_names = [] for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)): # Default values of symbolic_weights is /variable for theano - if K.backend() == 'theano': - if hasattr(w, 'name') and w.name != "/variable": + if K.backend() == "theano": + if hasattr(w, "name") and w.name != "/variable": name = str(w.name) else: - name = 'param_' + str(i) + name = "param_" + str(i) else: - if hasattr(w, 'name') and w.name: + if hasattr(w, "name") and w.name: name = str(w.name) else: - name = 'param_' + str(i) - weight_names.append(name.encode('utf8')) + name = "param_" + str(i) + weight_names.append(name.encode("utf8")) - optimizer_weights_group.attrs['weight_names'] = weight_names + optimizer_weights_group.attrs["weight_names"] = weight_names for name, val in zip(weight_names, weight_values): - param_dset = optimizer_weights_group.create_dataset( - name, - val.shape, - dtype=val.dtype) + param_dset = optimizer_weights_group.create_dataset(name, + val.shape, + dtype=val.dtype) if not val.shape: # scalar param_dset[()] = val else: param_dset[:] = val + def load_optimizer(model, hdf_file): - # instantiate optimizer - training_config = hdf_file.attrs.get('training_config') + # instantiate optimizer + training_config = hdf_file.attrs.get("training_config") - training_config = json.loads(training_config.decode('utf-8')) - optimizer_config = training_config['optimizer_config'] - optimizer = optimizers.deserialize(optimizer_config, - custom_objects={}) + training_config = json.loads(training_config.decode("utf-8")) + optimizer_config = training_config["optimizer_config"] + optimizer = optimizers.deserialize(optimizer_config, custom_objects={}) # 
Recover loss functions and metrics. - loss = convert_custom_objects(training_config['loss']) - metrics = convert_custom_objects(training_config['metrics']) - sample_weight_mode = training_config['sample_weight_mode'] - loss_weights = training_config['loss_weights'] + loss = convert_custom_objects(training_config["loss"]) + metrics = convert_custom_objects(training_config["metrics"]) + sample_weight_mode = training_config["sample_weight_mode"] + loss_weights = training_config["loss_weights"] # Compile model. - model.compile(optimizer=optimizer, - loss=loss, - metrics=metrics, - loss_weights=loss_weights, - sample_weight_mode=sample_weight_mode) + model.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + sample_weight_mode=sample_weight_mode, + ) # Set optimizer weights. - if 'optimizer_weights' in hdf_file: + if "optimizer_weights" in hdf_file: # Build train function (to get weight updates). if isinstance(model, Sequential): model.model._make_train_function() else: model._make_train_function() - optimizer_weights_group = hdf_file['optimizer_weights'] - optimizer_weight_names = [n.decode('utf8') for n in optimizer_weights_group.attrs['weight_names']] - optimizer_weight_values = [optimizer_weights_group[n] for n in optimizer_weight_names] + optimizer_weights_group = hdf_file["optimizer_weights"] + optimizer_weight_names = [ + n.decode("utf8") + for n in optimizer_weights_group.attrs["weight_names"] + ] + optimizer_weight_values = [ + optimizer_weights_group[n] for n in optimizer_weight_names + ] model.optimizer.set_weights(optimizer_weight_values) def save_state(model, outdir, rank): fname = "{}/weights_opt_{}.h5".format(outdir, rank) - with h5py.File(fname, 'w') as f: - model_weights_group = f.create_group('model_weights') + with h5py.File(fname, "w") as f: + model_weights_group = f.create_group("model_weights") topology.save_weights_to_hdf5_group(model_weights_group, model.layers) save_optimizer(model, f) f.flush() + def 
load_state(model, outdir, rank): fname = "{}/weights_opt_{}.h5".format(outdir, rank) - + # keras.engine.network.py, l. 1124+ - with h5py.File(fname, 'r') as f: - f = h5py.File(fname, mode='r') + with h5py.File(fname, "r") as f: + f = h5py.File(fname, mode="r") weights = f - if 'layer_names' not in f.attrs and 'model_weights' in f: - weights = f['model_weights'] - + if "layer_names" not in f.attrs and "model_weights" in f: + weights = f["model_weights"] + topology.load_weights_from_hdf5_group(weights, model.layers) load_optimizer(model, f) - - - - - - diff --git a/workflows/pbt/python/tc1_pbt.py b/workflows/pbt/python/tc1_pbt.py index 65f9c6f8..a7c56cb3 100644 --- a/workflows/pbt/python/tc1_pbt.py +++ b/workflows/pbt/python/tc1_pbt.py @@ -1,15 +1,18 @@ +import importlib +import math +import os +import random import sys -import importlib, time -from mpi4py import MPI -import os, random, math +import time import ga_utils import pbt - from keras import backend as K +from mpi4py import MPI class TC1PBTWorker: + def __init__(self, rank): self.rank = rank @@ -22,15 +25,15 @@ def ready(self, pbt_client, model, epoch): # return ready def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the datastore. + """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the datastore. 
- :param metrics: the metrics in keras callback log + :param metrics: the metrics in keras callback log """ lr = float(K.get_value(model.optimizer.lr)) - data = {'lr': lr, 'score': metrics['val_loss']} + data = {"lr": lr, "score": metrics["val_loss"]} data.update(metrics) - #pbt_client.log("{}: putting data".format(self.rank)) + # pbt_client.log("{}: putting data".format(self.rank)) return data def update(self, epoch, pbt_client, model, data): @@ -38,43 +41,44 @@ def update(self, epoch, pbt_client, model, data): # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576} # current_lr = float(K.get_value(model.optimizer.lr)) - lr = data['lr'] + lr = data["lr"] draw = random.random() - if draw < .5: + if draw < 0.5: lr = lr * 0.8 else: lr = lr * 1.2 K.set_value(model.optimizer.lr, lr) - #pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) - #pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) + # pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) + # pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) def truncation_select(data, score): """ - :param data: list of dict containg each ranks' model data as well as - rank itself. - :return a dict that contains all the selected rank's model data, or an - empty dict if no selection + :param data: list of dict containg each ranks' model data as well as + rank itself. + :return a dict that contains all the selected rank's model data, or an + empty dict if no selection """ # e.g. data: [{'acc': 0.87916666666666665, 'loss': 0.38366817765765721, 'rank': 1, # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576}, ...] 
- items = sorted(data, key=lambda item: item['score']) + items = sorted(data, key=lambda item: item["score"]) size = len(items) quintile = int(round(size / 5.0)) - if quintile > 0 and score >= items[-quintile]['score']: + if quintile > 0 and score >= items[-quintile]["score"]: # in bottom 20%, so select from top 20% if quintile == 1: idx = 0 else: idx = random.randint(0, quintile - 1) - #print("Returning: {}".format(items[idx])) + # print("Returning: {}".format(items[idx])) return items[idx] else: - #print("Returning nothing") + # print("Returning nothing") return {} + def init_params(params_file, comm): param_factories = ga_utils.create_parameters(params_file, True) params = [{}] @@ -86,6 +90,7 @@ def init_params(params_file, comm): return params + def run_model(comm, rank, hyper_parameter_map, args): exp_dir = args[2] instance_dir = "{}/run_{}/".format(exp_dir, rank) @@ -94,12 +99,12 @@ def run_model(comm, rank, hyper_parameter_map, args): model_name = args[3] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = '{}/output'.format(instance_dir) - hyper_parameter_map['instance_directory'] = instance_dir - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = args[4] - hyper_parameter_map['run_id'] = rank + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "{}/output".format(instance_dir) + hyper_parameter_map["instance_directory"] = instance_dir + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = args[4] + hyper_parameter_map["run_id"] = rank runner = "{}_runner".format(model_name) sys.argv = [runner] @@ -108,7 +113,8 @@ def run_model(comm, rank, hyper_parameter_map, args): pbt_callback = pbt.PBTCallback(comm, 0, weights_dir, TC1PBTWorker(rank)) t = time.localtime() - pbt_callback.client.log("Client {} Start: {}".format(rank, time.strftime('%Y-%m-%d %H:%M:%S', t))) + pbt_callback.client.log("Client {} Start: {}".format( + rank, 
time.strftime("%Y-%m-%d %H:%M:%S", t))) try: pkg.run(hyper_parameter_map, [pbt_callback]) except: @@ -124,6 +130,7 @@ def init_dirs(outdir): if not os.path.exists(weights_dir): os.makedirs(weights_dir) + def main(args): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -140,14 +147,15 @@ def main(args): init_dirs(outdir) comm.scatter(params, root=0) log_file = "{}/log.txt".format(outdir) - root = pbt.PBTMetaDataStore(comm, outdir, truncation_select, log_file) + root = pbt.PBTMetaDataStore(comm, outdir, truncation_select, + log_file) root.run() else: params = comm.scatter(None, root=0) if len(params) > 0: run_model(comm, rank, params, args) - #print("{}: {}".format(rank, params)) + # print("{}: {}".format(rank, params)) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/workflows/pbt/python/tc1_pbt_ds.py b/workflows/pbt/python/tc1_pbt_ds.py index 9e975938..9d6bd3be 100644 --- a/workflows/pbt/python/tc1_pbt_ds.py +++ b/workflows/pbt/python/tc1_pbt_ds.py @@ -1,14 +1,17 @@ -import sys import importlib -from mpi4py import MPI -import os, random, math +import math +import os +import random +import sys import ga_utils import pbt - from keras import backend as K +from mpi4py import MPI + class TC1PBTWorker: + def __init__(self, rank): self.rank = rank @@ -21,15 +24,15 @@ def ready(self, pbt_client, model, epoch): # return ready def pack_data(self, pbt_client, model, metrics): - """ Packs relevant hyperparameters and selected score metric into a dict to be - passed to the datastore. + """Packs relevant hyperparameters and selected score metric into a dict + to be passed to the datastore. 
- :param metrics: the metrics in keras callback log + :param metrics: the metrics in keras callback log """ lr = float(K.get_value(model.optimizer.lr)) - data = {'lr': lr, 'score': metrics['val_loss']} + data = {"lr": lr, "score": metrics["val_loss"]} data.update(metrics) - #pbt_client.log("{}: putting data".format(self.rank)) + # pbt_client.log("{}: putting data".format(self.rank)) return data def update(self, epoch, pbt_client, model, data): @@ -37,47 +40,48 @@ def update(self, epoch, pbt_client, model, data): # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576} current_lr = float(K.get_value(model.optimizer.lr)) - lr = data['lr'] + lr = data["lr"] draw = random.random() - if draw < .5: + if draw < 0.5: lr = lr * 0.8 else: lr = lr * 1.2 K.set_value(model.optimizer.lr, lr) - pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data['rank'], current_lr, lr)) - #pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) + pbt_client.log("{},{},{},{},{}".format(self.rank, epoch, data["rank"], + current_lr, lr)) + # pbt_client.log("{}: updating from rank {}, lr from {} to {}".format(self.rank, data['rank'], old_lr, lr)) def truncation_select(data, score): """ - :param data: list of dict containg each ranks' model data as well as - rank itself. - :return a dict that contains all the selected rank's model data, or an - empty dict if no selection + :param data: list of dict containg each ranks' model data as well as + rank itself. + :return a dict that contains all the selected rank's model data, or an + empty dict if no selection """ # e.g. data: [{'acc': 0.87916666666666665, 'loss': 0.38366817765765721, 'rank': 1, # 'score': 0.36156702836354576, 'lr': 0.0010000000474974513, 'val_acc': 0.87870370237915607, # 'val_loss': 0.36156702836354576}, ...] 
- items = sorted(data, key=lambda item: item['score']) + items = sorted(data, key=lambda item: item["score"]) size = len(items) quintile = int(round(size / 5.0)) - if score >= items[-quintile]['score']: + if score >= items[-quintile]["score"]: # in bottom 20%, so select from top 20% idx = random.randint(0, quintile - 1) - #print("Returning: {}".format(items[idx])) + # print("Returning: {}".format(items[idx])) return items[idx] else: - #print("Returning nothing") + # print("Returning nothing") return {} + def random_select(data, score): - """ - Useful for testing to force a weight load. - """ + """Useful for testing to force a weight load.""" idx = random.randint(0, len(data) - 1) return data[idx] + def init_params(params_file, comm): param_factories = ga_utils.create_parameters(params_file, True) params = [{}] @@ -89,6 +93,7 @@ def init_params(params_file, comm): return params + def run_model(comm, rank, hyper_parameter_map, args): exp_dir = args[2] @@ -98,21 +103,25 @@ def run_model(comm, rank, hyper_parameter_map, args): model_name = args[3] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = '{}/output'.format(instance_dir) - hyper_parameter_map['instance_directory'] = instance_dir - hyper_parameter_map['model_name'] = model_name - hyper_parameter_map['experiment_id'] = args[4] - hyper_parameter_map['run_id'] = rank + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "{}/output".format(instance_dir) + hyper_parameter_map["instance_directory"] = instance_dir + hyper_parameter_map["model_name"] = model_name + hyper_parameter_map["experiment_id"] = args[4] + hyper_parameter_map["run_id"] = rank runner = "{}_runner".format(model_name) sys.argv = [runner] pkg = importlib.import_module(runner) weights_dir = "{}/weights".format(exp_dir) - pbt_callback = pbt.PBTCallback(comm, 0, weights_dir, TC1PBTWorker(rank), - dataspaces=True) + pbt_callback = pbt.PBTCallback(comm, + 0, + weights_dir, + TC1PBTWorker(rank), + 
dataspaces=True) pkg.run(hyper_parameter_map, [pbt_callback]) + def init_dirs(outdir): if not os.path.exists(outdir): os.makedirs(outdir) @@ -121,6 +130,7 @@ def init_dirs(outdir): if not os.path.exists(weights_dir): os.makedirs(weights_dir) + def main(args): comm = MPI.COMM_WORLD rank = comm.Get_rank() @@ -131,15 +141,18 @@ def main(args): init_dirs(outdir) comm.scatter(params, root=0) log_file = "{}/log.txt".format(outdir) - root = pbt.PBTMetaDataStore(comm, outdir, random_select, log_file, - dataspaces=True) + root = pbt.PBTMetaDataStore(comm, + outdir, + random_select, + log_file, + dataspaces=True) root.run() else: params = comm.scatter(None, root=0) run_model(comm, rank, params, args) - #print("{}: {}".format(rank, params)) + # print("{}: {}".format(rank, params)) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/workflows/pbt/python/test/pbt_tests.py b/workflows/pbt/python/test/pbt_tests.py index 79ef4291..68d512c4 100644 --- a/workflows/pbt/python/test/pbt_tests.py +++ b/workflows/pbt/python/test/pbt_tests.py @@ -1,47 +1,50 @@ from __future__ import print_function + import unittest -import tc1_pbt -import pbt_utils -import numpy as np -import keras -from keras.optimizers import Adam +import keras +import numpy as np +import pbt_utils +import tc1_pbt from keras import backend as K +from keras.optimizers import Adam + class TestPBT(unittest.TestCase): def testTruncate(self): data = [] for i in range(0, 11): - data.append({'score': 11 - i, 'rank': i}) + data.append({"score": 11 - i, "rank": i}) - #print(data) + # print(data) for i in range(0, 10): result = tc1_pbt.truncation_select(data, i) self.assertEqual(0, len(result)) for i in range(10, 12): - result = tc1_pbt.truncation_select(data, i) + result = tc1_pbt.truncation_select(data, i) self.assertTrue(len(result) > 0) - score = result['score'] - rank = result['rank'] + score = result["score"] + rank = result["rank"] self.assertTrue(rank == 9 or rank == 10) 
self.assertTrue(score < 3) + class TestIO(unittest.TestCase): def create_model(self, lr): X, y = np.random.rand(100, 50), np.random.randint(2, size=100) x = keras.layers.Input((50,)) - out = keras.layers.Dense(1, activation='sigmoid')(x) + out = keras.layers.Dense(1, activation="sigmoid")(x) model = keras.models.Model(x, out) optimizer = Adam(lr=lr) - model.compile(optimizer=optimizer, loss='binary_crossentropy') + model.compile(optimizer=optimizer, loss="binary_crossentropy") model.fit(X, y, epochs=5) return model def testIO(self): - model = self.create_model(.0001) + model = self.create_model(0.0001) lr = float(K.get_value(model.optimizer.lr)) self.assertAlmostEqual(0.0001, lr) weights = model.get_weights() @@ -59,5 +62,5 @@ def testIO(self): self.assertTrue(np.array_equal(weights[0], model.get_weights()[0])) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/workflows/pbt/scripts/common.m4 b/workflows/pbt/scripts/common.m4 index 040627e0..a3dc922d 100644 --- a/workflows/pbt/scripts/common.m4 +++ b/workflows/pbt/scripts/common.m4 @@ -4,4 +4,4 @@ divert(`-1') changecom(`dnl') define(`getenv', `esyscmd(printf -- "$`$1' ")') define(`getenv_nospace', `esyscmd(printf -- "$`$1'")') -divert \ No newline at end of file +divert diff --git a/workflows/pbt/scripts/local.cfg b/workflows/pbt/scripts/local.cfg index 174ee761..b8694d9a 100644 --- a/workflows/pbt/scripts/local.cfg +++ b/workflows/pbt/scripts/local.cfg @@ -1,3 +1,3 @@ export PROCS=6 export PPN=1 -export EXP_DIR=../experiments/$EXP_ID \ No newline at end of file +export EXP_DIR=../experiments/$EXP_ID diff --git a/workflows/pbt/scripts/local_submit.cfg b/workflows/pbt/scripts/local_submit.cfg index 22dd29df..dac8dfa2 100644 --- a/workflows/pbt/scripts/local_submit.cfg +++ b/workflows/pbt/scripts/local_submit.cfg @@ -7,4 +7,4 @@ PYTHONPATH+=":$ROOT/models/tc1" export PYTHONPATH=$PYTHONPATH -CMD="mpirun -n $PROCS python -u $PBT_PY $PARAMS_FILE $EXP_DIR tc1 $EXP_ID" \ No newline at 
end of file +CMD="mpirun -n $PROCS python -u $PBT_PY $PARAMS_FILE $EXP_DIR tc1 $EXP_ID" diff --git a/workflows/pbt/scripts/local_submit.m4 b/workflows/pbt/scripts/local_submit.m4 index e1b26d28..c76e0e47 100644 --- a/workflows/pbt/scripts/local_submit.m4 +++ b/workflows/pbt/scripts/local_submit.m4 @@ -1 +1 @@ -# EMPTY \ No newline at end of file +# EMPTY diff --git a/workflows/pbt/scripts/pbt_run.sh b/workflows/pbt/scripts/pbt_run.sh index ef53d3bf..343d09eb 100755 --- a/workflows/pbt/scripts/pbt_run.sh +++ b/workflows/pbt/scripts/pbt_run.sh @@ -50,4 +50,3 @@ source "$SITE"_submit.cfg #echo $CMD $CMD - diff --git a/workflows/pbt/scripts/theta.cfg b/workflows/pbt/scripts/theta.cfg index 0864aa97..aac2a607 100644 --- a/workflows/pbt/scripts/theta.cfg +++ b/workflows/pbt/scripts/theta.cfg @@ -2,4 +2,4 @@ export PROCS=128 export PPN=1 export WALLTIME=01:00:00 export PROJECT=CSC249ADOA01 -export QUEUE=default \ No newline at end of file +export QUEUE=default diff --git a/workflows/pbt/scripts/theta_submit.cfg b/workflows/pbt/scripts/theta_submit.cfg index 4598b840..8583aaea 100644 --- a/workflows/pbt/scripts/theta_submit.cfg +++ b/workflows/pbt/scripts/theta_submit.cfg @@ -6,4 +6,4 @@ PP=${PP//:/\\:} EXPORTS="ROOT=$ROOT:PBT_PY=$PBT_PY:BENCHMARKS=$BENCHMARKS:PP=$PP" EXPORTS+=":SUPERVISOR=$SUPERVISOR:EXP_ID=$EXP_ID:PARAMS_FILE=$P_FILE:EXP_DIR=$EXP_DIR" -CMD="qsub --env $EXPORTS --jobname=$EXP_ID --mode script $SH" \ No newline at end of file +CMD="qsub --env $EXPORTS --jobname=$EXP_ID --mode script $SH" diff --git a/workflows/pbt/scripts/theta_submit.m4 b/workflows/pbt/scripts/theta_submit.m4 index 9c5fe1d6..a65856ae 100644 --- a/workflows/pbt/scripts/theta_submit.m4 +++ b/workflows/pbt/scripts/theta_submit.m4 @@ -4,7 +4,7 @@ ifelse(getenv_nospace(PROJECT), `',,#COBALT -A getenv_nospace(PROJECT) )#COBALT -n getenv(NODES) #COBALT -t getenv(WALLTIME) #COBALT -o getenv_nospace(EXP_DIR)/output.txt -#COBALT -e getenv_nospace(EXP_DIR)/output.txt +#COBALT -e 
getenv_nospace(EXP_DIR)/output.txt #COBALT --cwd getenv(EXP_DIR) export PYTHONPATH=$PP:$PYTHONPATH diff --git a/workflows/pbt/scripts/titan.cfg b/workflows/pbt/scripts/titan.cfg index f0d42843..902c8f97 100644 --- a/workflows/pbt/scripts/titan.cfg +++ b/workflows/pbt/scripts/titan.cfg @@ -4,4 +4,4 @@ export PPN=1 export WALLTIME=00:40:00 export PROJECT=MED106 export QUEUE=batch -export EXP_DIR=$PROJWORK/csc249/ncollier/experiments/$EXP_ID \ No newline at end of file +export EXP_DIR=$PROJWORK/csc249/ncollier/experiments/$EXP_ID diff --git a/workflows/pbt/scripts/titan_submit.cfg b/workflows/pbt/scripts/titan_submit.cfg index 02c082da..9e3cd8f1 100644 --- a/workflows/pbt/scripts/titan_submit.cfg +++ b/workflows/pbt/scripts/titan_submit.cfg @@ -1,4 +1,4 @@ EXPORTS="ROOT=$ROOT,PBT_PY=$PBT_PY,BENCHMARKS=$BENCHMARKS,PP=$PP" EXPORTS+=",SUPERVISOR=$SUPERVISOR,EXP_ID=$EXP_ID,PARAMS_FILE=$P_FILE,EXP_DIR=$EXP_DIR" -export CMD="qsub -v $EXPORTS -d $EXP_DIR -N $EXP_ID $SH" \ No newline at end of file +export CMD="qsub -v $EXPORTS -d $EXP_DIR -N $EXP_ID $SH" diff --git a/workflows/pbt/src/Readme.md b/workflows/pbt/src/Readme.md index 143bad42..4179658c 100644 --- a/workflows/pbt/src/Readme.md +++ b/workflows/pbt/src/Readme.md @@ -1,4 +1,4 @@ -# PBT src folder # +# PBT src folder This folder contains experimental c source code for using Dataspaces (http://personal.cac.rutgers.edu/TASSL/projects/data/index.html) as the PBT datastore, i.e. where the weights and model metadata are stored for querying by other models. diff --git a/workflows/random/README.md b/workflows/random/README.md index ccc8411c..b79fb7b0 100644 --- a/workflows/random/README.md +++ b/workflows/random/README.md @@ -1,5 +1,7 @@ # Simple parameter sweep with Swift -> parameters randomly chosen between specified bounds. 
+ The main program (random-sweep.swift) calls a few app functions as follows: + - determineParameters.{sh,py}: Read data/ **settings.json** for sweep parameters, and return as a string for use by Swift program - evaluateOne.{sh,py}: Runs a single experiment. (Calls p1b1_runner). - computeStats.{sh,py}: Ingests data from all of the experiments and computes simple stats. @@ -7,19 +9,22 @@ The main program (random-sweep.swift) calls a few app functions as follows: Usage: ./run experient_1 Notes: -- **settings.json**: -A. parameters (benchmark parameters) -===================================== -1: epochs + +- **settings.json**: + A. parameters (benchmark parameters) + ===================================== + 1: epochs + 2. batch_size 3. N1 4. NE -B. samples (specifies the number of random samples to prepare) -=============================================================== +# B. samples (specifies the number of random samples to prepare) + 1. num For adding new parameters: + 1. Add to the json file the desired parameters 2. Read params in determineParameters.py: def loadSettings(settingsFilename): -3. Modify the evaluateOne.py file (set to run on keras framework now) \ No newline at end of file +3. 
Modify the evaluateOne.py file (set to run on keras framework now) diff --git a/workflows/random/data/settings.json b/workflows/random/data/settings.json index c0561733..4aca314d 100644 --- a/workflows/random/data/settings.json +++ b/workflows/random/data/settings.json @@ -1,15 +1,13 @@ { - "parameters": - { - "epochs": [4, 8 ], - "batch_size": [30, 40], - "N1": [1500, 1500], - "NE": [600, 600], - "latent_dim": [2, 64], - "learning_rate": [0.00001, 0.1] - }, - "samples": - { - "num": [120] - } + "parameters": { + "epochs": [4, 8], + "batch_size": [30, 40], + "N1": [1500, 1500], + "NE": [600, 600], + "latent_dim": [2, 64], + "learning_rate": [0.00001, 0.1] + }, + "samples": { + "num": [120] + } } diff --git a/workflows/random/python/computeStats.py b/workflows/random/python/computeStats.py index 69704a31..4e33ee1b 100644 --- a/workflows/random/python/computeStats.py +++ b/workflows/random/python/computeStats.py @@ -1,34 +1,41 @@ +import json +import os import sys from collections import defaultdict -import json, os + def extractVals(A): B = defaultdict(dict) A1 = A.split() for n, val in zip(A1[0::2], A1[1::2]): B[n] = float(val) - return(B) + return B + def computeStats(swiftArrayAsString): A = extractVals(swiftArrayAsString) vals = [] for a in A: vals += [A[a]] - print('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) - - filename = os.environ['TURBINE_OUTPUT']+ "/final_stats.txt" + print("%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / float(len(vals)))) + + filename = os.environ["TURBINE_OUTPUT"] + "/final_stats.txt" # writing the val loss to the output file - with open(filename, 'w') as the_file: - the_file.write('%d values, with min=%f, max=%f, avg=%f\n'%(len(vals),min(vals),max(vals),sum(vals)/float(len(vals)))) + with open(filename, "w") as the_file: + the_file.write( + "%d values, with min=%f, max=%f, avg=%f\n" % + (len(vals), min(vals), max(vals), sum(vals) / 
float(len(vals)))) + -if (len(sys.argv) < 2): - print('requires arg=dataFilename') - sys.exit(1) +if len(sys.argv) < 2: + print("requires arg=dataFilename") + sys.exit(1) dataFilename = sys.argv[1] try: - with open(dataFilename, 'r') as the_file: + with open(dataFilename, "r") as the_file: data = the_file.read() except IOError as e: print("Could not open: %s" % dataFilename) diff --git a/workflows/random/python/determineParameters.py b/workflows/random/python/determineParameters.py index 0fc2a51c..0acd5daf 100644 --- a/workflows/random/python/determineParameters.py +++ b/workflows/random/python/determineParameters.py @@ -1,8 +1,11 @@ -import sys, json, os +import json +import os +import sys from random import randint, uniform # ===== Definitions ========================================================= + def loadSettings(settingsFilename): print("Reading settings: %s" % settingsFilename) try: @@ -13,51 +16,54 @@ def loadSettings(settingsFilename): print("PWD is: '%s'" % os.getcwd()) sys.exit(1) try: - epochs = settings['parameters']["epochs"] - batch_size = settings['parameters']["batch_size"] - N1 = settings['parameters']["N1"] - NE = settings['parameters']["NE"] - latent_dim = settings['parameters']["latent_dim"] - learning_rate = settings['parameters']["learning_rate"] - + epochs = settings["parameters"]["epochs"] + batch_size = settings["parameters"]["batch_size"] + N1 = settings["parameters"]["N1"] + NE = settings["parameters"]["NE"] + latent_dim = settings["parameters"]["latent_dim"] + learning_rate = settings["parameters"]["learning_rate"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) try: - samples = settings['samples']["num"] + samples = settings["samples"]["num"] except KeyError as e: - print("Settings file (%s) does not contain key: %s" % (settingsFilename, str(e))) + print("Settings file (%s) does 
not contain key: %s" % + (settingsFilename, str(e))) sys.exit(1) - return(epochs, batch_size, N1, NE, latent_dim, learning_rate, samples) + return (epochs, batch_size, N1, NE, latent_dim, learning_rate, samples) + # ===== Main program ======================================================== -if (len(sys.argv) < 3): - print('requires arg1=settingsFilename and arg2=paramsFilename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=settingsFilename and arg2=paramsFilename") + sys.exit(1) settingsFilename = sys.argv[1] -paramsFilename = sys.argv[2] +paramsFilename = sys.argv[2] -print (settingsFilename) -print (paramsFilename) +print(settingsFilename) +print(paramsFilename) -epochs, batch_size, N1, NE, latent_dim, learning_rate, samples = loadSettings(settingsFilename) -result="" +epochs, batch_size, N1, NE, latent_dim, learning_rate, samples = loadSettings( + settingsFilename) +result = "" # select '#samples' random numbers between the range provided in settings.json file for s in range(samples[0]): - t_epoch= randint(epochs[0], epochs[1]) - t_batch_size= randint(batch_size[0], batch_size[1]) - t_N1= randint(N1[0], N1[1]) - t_NE= randint(NE[0], NE[1]) - t_ld= randint(latent_dim[0], latent_dim[1]) - t_lr= uniform(learning_rate[0], learning_rate[1]) - result+=str(t_epoch) + ',' + str(t_batch_size) + ',' + str(t_N1) + ',' + str(t_NE) + ',' + str(t_ld)+ ',' + str(t_lr) - if(s < (samples[0]-1)): - result+=":" - -with open(paramsFilename, 'w') as the_file: - the_file.write(result) + t_epoch = randint(epochs[0], epochs[1]) + t_batch_size = randint(batch_size[0], batch_size[1]) + t_N1 = randint(N1[0], N1[1]) + t_NE = randint(NE[0], NE[1]) + t_ld = randint(latent_dim[0], latent_dim[1]) + t_lr = uniform(learning_rate[0], learning_rate[1]) + result += (str(t_epoch) + "," + str(t_batch_size) + "," + str(t_N1) + "," + + str(t_NE) + "," + str(t_ld) + "," + str(t_lr)) + if s < (samples[0] - 1): + result += ":" +with open(paramsFilename, "w") as the_file: + 
the_file.write(result) diff --git a/workflows/random/python/evaluateOne.py b/workflows/random/python/evaluateOne.py index 00910697..3b823eb6 100644 --- a/workflows/random/python/evaluateOne.py +++ b/workflows/random/python/evaluateOne.py @@ -1,48 +1,52 @@ +import json +import os +import socket import sys + import p1b1_runner -import json, os -import socket -if (len(sys.argv) < 3): - print('requires arg1=param and arg2=filename') - sys.exit(1) +if len(sys.argv) < 3: + print("requires arg1=param and arg2=filename") + sys.exit(1) parameterString = sys.argv[1] -filename = sys.argv[2] +filename = sys.argv[2] # print (parameterString) -print ("filename is " + filename) -print (socket.gethostname()) - -#List of hyperparameters - edit this to add or remove a parameter -epochs, batch_size, d1, d2, ld, lr = parameterString.split(',') - -hyper_parameter_map = {'epochs' : int(epochs)} -hyper_parameter_map['framework'] = 'keras' -hyper_parameter_map['batch_size'] = int(batch_size) -hyper_parameter_map['dense'] = [int(d1), int(d2)] -hyper_parameter_map['latent_dim'] = int(ld) -hyper_parameter_map['learning_rate'] = float(lr) - -hyper_parameter_map['run_id'] = parameterString -# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] -hyper_parameter_map['save'] = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] -sys.argv = ['p1b1_runner'] +print("filename is " + filename) +print(socket.gethostname()) + +# List of hyperparameters - edit this to add or remove a parameter +epochs, batch_size, d1, d2, ld, lr = parameterString.split(",") + +hyper_parameter_map = {"epochs": int(epochs)} +hyper_parameter_map["framework"] = "keras" +hyper_parameter_map["batch_size"] = int(batch_size) +hyper_parameter_map["dense"] = [int(d1), int(d2)] +hyper_parameter_map["latent_dim"] = int(ld) +hyper_parameter_map["learning_rate"] = float(lr) + +hyper_parameter_map["run_id"] = parameterString +# hyper_parameter_map['instance_directory'] = os.environ['TURBINE_OUTPUT'] 
+hyper_parameter_map["save"] = (os.environ["TURBINE_OUTPUT"] + "/output-" + + os.environ["PMI_RANK"]) +sys.argv = ["p1b1_runner"] val_loss = p1b1_runner.run(hyper_parameter_map) -print (val_loss) +print(val_loss) -sfn = os.environ['TURBINE_OUTPUT']+ "/output-"+os.environ['PMI_RANK'] + "/procname-" + parameterString -with open(sfn, 'w') as sfile: +sfn = (os.environ["TURBINE_OUTPUT"] + "/output-" + os.environ["PMI_RANK"] + + "/procname-" + parameterString) +with open(sfn, "w") as sfile: sfile.write(socket.getfqdn()) - proc_id = "-"+ str(os.getpid()) + proc_id = "-" + str(os.getpid()) sfile.write(proc_id) # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 from keras import backend as K + K.clear_session() # writing the val loss to the output file (result-*) -with open(filename, 'w') as the_file: +with open(filename, "w") as the_file: the_file.write(repr(val_loss)) - diff --git a/workflows/random/python/p1b1_runner.py b/workflows/random/python/p1b1_runner.py index 7ceb0c59..ddb43b10 100644 --- a/workflows/random/python/p1b1_runner.py +++ b/workflows/random/python/p1b1_runner.py @@ -1,24 +1,30 @@ # tensoflow.__init__ calls _os.path.basename(_sys.argv[0]) # so we need to create a synthetic argv. 
import sys -if not hasattr(sys, 'argv'): - sys.argv = ['p1b1'] + +if not hasattr(sys, "argv"): + sys.argv = ["p1b1"] import json import os + import p1b1 import runner_utils + def run(hyper_parameter_map): - framework = hyper_parameter_map['framework'] - if framework is 'keras': + framework = hyper_parameter_map["framework"] + if framework is "keras": import p1b1_baseline_keras2 + pkg = p1b1_baseline_keras2 - elif framework is 'mxnet': + elif framework is "mxnet": import p1b1_baseline_mxnet + pkg = p1b1_baseline_mxnet - elif framework is 'neon': + elif framework is "neon": import p1b1_baseline_neon + pkg = p1b1_baseline_neon else: raise ValueError("Invalid framework: {}".format(framework)) @@ -27,23 +33,24 @@ def run(hyper_parameter_map): params = pkg.initialize_parameters() runner_utils.format_params(hyper_parameter_map) - for k,v in hyper_parameter_map.items(): - #if not k in params: + for k, v in hyper_parameter_map.items(): + # if not k in params: # raise Exception("Parameter '{}' not found in set of valid arguments".format(k)) params[k] = v print(params) history = pkg.run(params) - if framework is 'keras': + if framework is "keras": # works around this error: # https://github.com/tensorflow/tensorflow/issues/3388 try: from keras import backend as K + K.clear_session() - except AttributeError: # theano does not have this function + except AttributeError: # theano does not have this function pass # use the last validation_loss as the value to minimize - val_loss = history.history['val_loss'] + val_loss = history.history["val_loss"] return val_loss[-1] diff --git a/workflows/random/python/test/run_test_p1b1.sh b/workflows/random/python/test/run_test_p1b1.sh index 65e4c62a..14e4964d 100755 --- a/workflows/random/python/test/run_test_p1b1.sh +++ b/workflows/random/python/test/run_test_p1b1.sh @@ -4,4 +4,4 @@ P1B1_DIR=../../../../../Benchmarks/Pilot1/P1B1 export PYTHONPATH="$PWD/..:$P1B1_DIR:../../../common/python" echo $PYTHONPATH -python test_p1b1.py \ No newline 
at end of file +python test_p1b1.py diff --git a/workflows/random/python/test/test_p1b1.py b/workflows/random/python/test/test_p1b1.py index 192de79b..8c0cdd9e 100644 --- a/workflows/random/python/test/test_p1b1.py +++ b/workflows/random/python/test/test_p1b1.py @@ -1,14 +1,17 @@ import p1b1_runner + def main(): - hyper_parameter_map = {'epochs' : 1} - hyper_parameter_map['batch_size'] = 40 - hyper_parameter_map['dense'] = [1900, 500] - hyper_parameter_map['framework'] = 'keras' - hyper_parameter_map['save'] = './p1bl1_output' + hyper_parameter_map = {"epochs": 1} + hyper_parameter_map["batch_size"] = 40 + hyper_parameter_map["dense"] = [1900, 500] + hyper_parameter_map["framework"] = "keras" + hyper_parameter_map["save"] = "./p1bl1_output" validation_loss = p1b1_runner.run(hyper_parameter_map) print("Validation Loss: ", validation_loss) -if __name__ == '__main__': + + +if __name__ == "__main__": main() diff --git a/workflows/random/swift/cooley_workflow.sh b/workflows/random/swift/cooley_workflow.sh index 50cd66ab..ddd2dfee 100755 --- a/workflows/random/swift/cooley_workflow.sh +++ b/workflows/random/swift/cooley_workflow.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Usage: ./run +# Usage: ./run # if [ "$#" -ne 1 ]; then @@ -76,4 +76,3 @@ set -x export TURBINE_LOG=1 echo swift-t -l -n $PROCS $MACHINE -p $ENVS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json swift-t -l -n $PROCS $MACHINE -p $ENVS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json - diff --git a/workflows/random/swift/workflow.sh b/workflows/random/swift/workflow.sh index 08bcef8d..1ea94308 100755 --- a/workflows/random/swift/workflow.sh +++ b/workflows/random/swift/workflow.sh @@ -62,7 +62,7 @@ CMD_LINE_ARGS=( -param_set_file=$PARAM_SET_FILE # settings.json file has all the parameter combinations to be tested #echo swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* -#swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* 
--settings=$PWD/../data/settings.json +#swift-t -l -n $PROCS $EMEWS_PROJECT_ROOT/random-sweep.swift $* --settings=$PWD/../data/settings.json @@ -86,5 +86,3 @@ swift-t -n $PROCS \ $( python_envs ) \ -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ $EMEWS_PROJECT_ROOT/swift/random-sweep.swift ${CMD_LINE_ARGS[@]} - - diff --git a/workflows/random/test/cfg-prm-1.sh b/workflows/random/test/cfg-prm-1.sh index 3db2ca4a..e60e7613 100644 --- a/workflows/random/test/cfg-prm-1.sh +++ b/workflows/random/test/cfg-prm-1.sh @@ -11,4 +11,3 @@ MAX_BUDGET=${MAX_BUDGET:-1800} DESIGN_SIZE=${DESIGN_SIZE:-2} PARAM_SET_FILE=${PARAM_SET_FILE:-$EMEWS_PROJECT_ROOT/data/parameter_set.R} MODEL_NAME="p1b1" - diff --git a/workflows/random/test/cfg-sys-1.sh b/workflows/random/test/cfg-sys-1.sh index 6e48105f..b0afa605 100644 --- a/workflows/random/test/cfg-sys-1.sh +++ b/workflows/random/test/cfg-sys-1.sh @@ -18,4 +18,3 @@ export WALLTIME=${WALLTIME:-01:33:00} # Benchmark run timeout: benchmark run will timeouT # after the specified number of seconds. -1 is no timeout. 
BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} - diff --git a/workflows/random/test/test.sh b/workflows/random/test/test.sh index 5cd5b3d1..57bfd2aa 100755 --- a/workflows/random/test/test.sh +++ b/workflows/random/test/test.sh @@ -35,4 +35,3 @@ SCRIPT=$( basename $0 .sh ) check_output "learning_rate" $OUTPUT $WORKFLOW $SCRIPT $JOBID echo "$SCRIPT: SUCCESS" - diff --git a/workflows/test-horovod/make.sh b/workflows/test-horovod/make.sh index 6d416ec9..5463b7f1 100755 --- a/workflows/test-horovod/make.sh +++ b/workflows/test-horovod/make.sh @@ -22,7 +22,7 @@ make # mpicc -c -fPIC $TCL_INCLUDE_SPEC -I$CONTROLLER horovod_wrap.c # mpicc -shared -o libhorovod.so horovod_wrap.o $CONTROLLER/controller.o \ -# -l python2.7 +# -l python2.7 # tclsh make-package.tcl > pkgIndex.tcl # stc -r $PWD test-horovod.swift diff --git a/workflows/test-horovod/test.py b/workflows/test-horovod/test.py index 44e1c1c2..9702a536 100644 --- a/workflows/test-horovod/test.py +++ b/workflows/test-horovod/test.py @@ -1,7 +1,7 @@ - # This is the test Horovod program print("Importing...") -import keras import horovod.keras as hvd +import keras + print("Imported.") diff --git a/workflows/upf/README.md b/workflows/upf/README.md index ccbe4267..42346872 100644 --- a/workflows/upf/README.md +++ b/workflows/upf/README.md @@ -1,47 +1,48 @@ -# Evaluate an Unrolled Parameter File (UPF) # +# Evaluate an Unrolled Parameter File (UPF) This workflow evaluates ensembles of "Benchmark" available here: `git@github.com:ECP-CANDLE/Benchmarks.git` for a given set of parameters. -## Running ## +## Running -1. cd into the *~/Supervisor/workflows/upf/test* directory -2. Specify the MODEL_NAME in *upf-1.sh* file, hyperparameters in *upf-1.txt* -3. Specify the #procs, queue etc. in *cfg-sys-1.sh* file -4. Launch the test by invoking *./upf-1.sh * - where machine_name can be cori, theta, titan etc. +1. cd into the _~/Supervisor/workflows/upf/test_ directory +2. 
Specify the MODEL*NAME in \_upf-1.sh* file, hyperparameters in _upf-1.txt_ +3. Specify the #procs, queue etc. in _cfg-sys-1.sh_ file +4. Launch the test by invoking _./upf-1.sh _ + where machine_name can be cori, theta, titan etc. 5. The benchmark will be run for the number of processors specified 6. Final objective function value will be available in the experiments directory and also printed - -## User requirements ## +## User requirements What you need to install to run the workflow: -* This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . +- This workflow - `git@github.com:ECP-CANDLE/Supervisor.git` . Clone and `cd` to `workflows/nt3_mlrMBO` (the directory containing this README). -* NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . +- NT3 benchmark - `git@github.com:ECP-CANDLE/Benchmarks.git` . Clone and switch to the `frameworks` branch. -* benchmark data - - See the individual benchmarks README for obtaining the initial data -* Swift/T with the recently implemented JSON module, +- benchmark data - + See the individual benchmarks README for obtaining the initial data +- Swift/T with the recently implemented JSON module, cf. 
https://github.com/swift-lang/swift-t/issues/121 -## Calling sequence ## +## Calling sequence Script call stack :- -* upf-1.sh -> swift/workflow.sh -> swift/workflow.swift -> -common/swift/obj_app.swift -> common/sh/model.sh -> -common/python/model_runner.py -> 'calls the benchmark' + +- upf-1.sh -> swift/workflow.sh -> swift/workflow.swift -> + common/swift/obj_app.swift -> common/sh/model.sh -> + common/python/model_runner.py -> 'calls the benchmark' Scheduling scripts :- -* upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files -## Infer workflow ## +- upf-1.sh -> cfg-sys-1.sh -> common/sh/ - module, scheduling, langs .sh files + +## Infer workflow This workflow assumes you have a data directory (called, say, DATA) containing run directories for processing with the new infer.py script -### Quick start ### +### Quick start ``` $ cd workflows/upf/test @@ -57,8 +58,8 @@ $ ./mk-infer-upf.sh upf-DATA.txt /path/to/DATA/uq.{40..100} $ ./upf-infer.sh cori upf-DATA.txt ``` -### File index ### +### File index -* mk-infer-upf.sh: Assembles the JSON fragments into the UPF -* infer-template.json: M4 template for JSON fragments. Populated by environment variables set in mk-infer-upf.sh -* swift/workflow.{sh,swift}: Normal UPF workflow but newly extracts id from JSON template. The id is used as the run output directory +- mk-infer-upf.sh: Assembles the JSON fragments into the UPF +- infer-template.json: M4 template for JSON fragments. Populated by environment variables set in mk-infer-upf.sh +- swift/workflow.{sh,swift}: Normal UPF workflow but newly extracts id from JSON template. The id is used as the run output directory diff --git a/workflows/upf/swift/workflow.sh b/workflows/upf/swift/workflow.sh index bdfc0f9e..7d7d4c1b 100755 --- a/workflows/upf/swift/workflow.sh +++ b/workflows/upf/swift/workflow.sh @@ -4,19 +4,14 @@ set -eu # UPF WORKFLOW SH # Autodetect this workflow directory -export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. 
; /bin/pwd ) -export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT=$( realpath $( dirname $0 )/.. ) +export WORKFLOWS_ROOT=$( realpath $EMEWS_PROJECT_ROOT/.. ) -export BENCHMARKS_ROOT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) -BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3:$BENCHMARKS_ROOT/Pilot2/P2B1:$BENCHMARKS_ROOT/Pilot1/P1B1:$BENCHMARKS_ROOT/Pilot1/Combo:$BENCHMARKS_ROOT/Pilot3/P3B1:$BENCHMARKS_ROOT/Pilot3/P3B3:$BENCHMARKS_ROOT/Pilot3/P3B4:$BENCHMARKS_ROOT/Pilot3/P3B5 -export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} SCRIPT_NAME=$(basename $0) # Source some utility functions used by EMEWS in this script source $WORKFLOWS_ROOT/common/sh/utils.sh -export TURBINE_LOG=0 TURBINE_DEBUG=0 ADLB_DEBUG=0 - usage() { echo "UNROLLED PARAMETER FILE: usage: workflow.sh SITE EXPID CFG_SYS UPF" @@ -29,9 +24,13 @@ then fi if ! { - get_site $1 # Sets SITE - get_expid $2 # Sets EXPID, TURBINE_OUTPUT - get_cfg_sys $3 + # Sets SITE + # Sets EXPID, TURBINE_OUTPUT + # Sets CFG_SYS + # UPF is the JSON hyperparameter file + get_site $1 && \ + get_expid $2 && \ + get_cfg_sys $3 && \ UPF=$4 } then @@ -39,21 +38,15 @@ then exit 1 fi -# Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common -PYTHONPATH+=:$WORKFLOWS_ROOT/common/python - source_site env $SITE -source_site sched $SITE +source_site sched $SITE -log_path PYTHONPATH +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh -if [[ ${EQR:-} == "" ]] -then - abort "The site '$SITE' did not set the location of EQ/R: this will not work!" 
-fi +log_path PYTHONPATH -export TURBINE_JOBNAME="JOB:${EXPID}" +export TURBINE_JOBNAME="${EXPID}" OBJ_PARAM_ARG="" if [[ ${OBJ_PARAM:-} != "" ]] @@ -61,14 +54,12 @@ then OBJ_PARAM_ARG="--obj_param=$OBJ_PARAM" fi -# Andrew: Allows for custom model.sh if desired export MODEL_SH=${MODEL_SH:-$WORKFLOWS_ROOT/common/sh/model.sh} export BENCHMARK_TIMEOUT CMD_LINE_ARGS=( -expid=$EXPID -benchmark_timeout=$BENCHMARK_TIMEOUT - -f=$UPF # ALW: keeping it as $UPF to allow $UPF to be a full path - #-f=$TURBINE_OUTPUT/$UPF # Copied to TURBINE_OUTPUT below + -f=$UPF ) USER_VARS=( $CMD_LINE_ARGS ) @@ -81,36 +72,45 @@ cp $CFG_SYS $TURBINE_OUTPUT # Make run directory in advance to reduce contention mkdir -pv $TURBINE_OUTPUT/run -which mpicc -which swift-t +cp -v $UPF $TURBINE_OUTPUT -module list +# TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +TURBINE_STDOUT= -cp -v $UPF $TURBINE_OUTPUT +if [[ ${CANDLE_DATA_DIR:-} == "" ]] +then + abort "upf/workflow.sh: Set CANDLE_DATA_DIR!" +fi + +export CANDLE_IMAGE=${CANDLE_IMAGE:-} -TURBINE_STDOUT="$TURBINE_OUTPUT/out-%%r.txt" +which swift-t swift-t -n $PROCS \ -o $TURBINE_OUTPUT/workflow.tic \ ${MACHINE:-} \ - -p -I $EQR -r $EQR \ + -p \ -I $WORKFLOWS_ROOT/common/swift \ - -i obj_$SWIFT_IMPL \ - -e LD_LIBRARY_PATH=$LD_LIBRARY_PATH \ + -i model_$CANDLE_MODEL_IMPL \ -e BENCHMARKS_ROOT \ -e EMEWS_PROJECT_ROOT \ -e MODEL_SH \ + -e FI_MR_CACHE_MAX_COUNT=0 \ -e SITE \ -e BENCHMARK_TIMEOUT \ - -e MODEL_NAME \ + -e MODEL_NAME=${MODEL_NAME:-MODEL_NULL} \ -e OBJ_RETURN \ -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ - -e TURBINE_MPI_THREAD=1 \ + -e TURBINE_MPI_THREAD=${TURBINE_MPI_THREAD:-1} \ $( python_envs ) \ -e TURBINE_STDOUT=$TURBINE_STDOUT \ - -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ - -e PYTHONUNBUFFERED=1 \ + -e CANDLE_MODEL_TYPE \ + -e CANDLE_IMAGE \ $EMEWS_PROJECT_ROOT/swift/workflow.swift ${CMD_LINE_ARGS[@]} +# Can provide this to debug Python settings: # -e PYTHONVERBOSE=1 -# -e PATH=$PATH +# Can provide this if needed for debugging crashes: 
+# -e PYTHONUNBUFFERED=1 +# Can provide this if needed to reset PATH: +# -e PATH=$PATH diff --git a/workflows/upf/swift/workflow.swift b/workflows/upf/swift/workflow.swift index 98afcb1a..1b57d605 100644 --- a/workflows/upf/swift/workflow.swift +++ b/workflows/upf/swift/workflow.swift @@ -17,11 +17,11 @@ report_env(); string FRAMEWORK = "keras"; // Scan command line -file upf = input(argv("f")); -int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +file upf = input(argv("f")); +int benchmark_timeout = string2int(argv("benchmark_timeout", "-1")); string model_name = getenv("MODEL_NAME"); -string exp_id = getenv("EXPID"); +string expid = getenv("EXPID"); string turbine_output = getenv("TURBINE_OUTPUT"); // Report some key facts: @@ -38,14 +38,12 @@ string results[]; foreach params,i in upf_lines { printf("params: %s", params); - id = json_get(params, "id"); - // NOTE: obj() is in the obj_*.swift supplied by workflow.sh - // id = "id_%02i"%i; - results[i] = obj(params, id); - assert(results[i] != "EXCEPTION", "exception in obj()!"); + runid = json_get(params, "id"); + results[i] = candle_model_train(params, expid, runid, model_name); + assert(results[i] != "EXCEPTION", "exception in candle_model_train()!"); } // Join all result values into one big semicolon-delimited string string result = join(results, ";"); // and print it -printf(result); +printf("WORKFLOW RESULT: " + result); diff --git a/workflows/upf/test/cfg-sys-1.sh b/workflows/upf/test/cfg-sys-1.sh index c8180e15..16eda96c 100644 --- a/workflows/upf/test/cfg-sys-1.sh +++ b/workflows/upf/test/cfg-sys-1.sh @@ -1,6 +1,9 @@ # UPF CFG SYS 1 +# Use 1 for interactive workflows +# export INTERACTIVE=1 + # The number of MPI processes # Note that 1 process is reserved for Swift/T # For example, if PROCS=4 that gives you 3 workers, @@ -9,7 +12,7 @@ export PROCS=${PROCS:-2} # MPI processes per node. This should not exceed PROCS. 
# Cori has 32 cores per node, 128GB per node -export PPN=${PPN:-1} +export PPN=${PPN:-2} #export QUEUE=${QUEUE:-batch} @@ -23,16 +26,18 @@ export PPN=${PPN:-1} # Theta: (cf. sched-theta) # export QUEUE=${QUEUE:-debug-cache-quad} -export QUEUE=${QUEUE:-debug-flat-quad} +# export QUEUE=${QUEUE:-debug-flat-quad} # export PROJECT=${PROJECT:-ecp-testbed-01} # export PROJECT=Candle_ECP -export PROJECT=CSC249ADOA01 +# export PROJECT=CSC249ADOA01 # Summit: -export QUEUE=${QUEUE:-batch} -export PROJECT=med106 +# export QUEUE=${QUEUE:-batch} +# export PROJECT=med106 +# export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" -export WALLTIME=${WALLTIME:-0:30} +export WALLTIME=${WALLTIME:-1:00:00} +echo WALLTIME: $WALLTIME # export MAIL_ENABLED=1 # export MAIL_ADDRESS=wozniak@mcs.anl.gov diff --git a/workflows/upf/test/cfg-sys-demo-1.sh b/workflows/upf/test/cfg-sys-demo-1.sh new file mode 100644 index 00000000..e4788c04 --- /dev/null +++ b/workflows/upf/test/cfg-sys-demo-1.sh @@ -0,0 +1,23 @@ + +# CFG SYS DEMO 1 + + +# The number of MPI processes +# Note that 1 process is reserved for Swift/T +# For example, if PROCS=4 that gives you 3 workers, +# i.e., 3 concurrent Keras runs. +export PROCS=${PROCS:-6} + +# MPI processes per node. This should not exceed PROCS. +export PPN=${PPN:-6} + +# Summit: +export QUEUE=${QUEUE:-batch} +export PROJECT=med106 +export TURBINE_LAUNCH_OPTIONS="-a1 -g1 -c7" + +# export WALLTIME=${WALLTIME:-0:30} + +# Benchmark run timeout: benchmark run will timeouT +# after the specified number of seconds. -1 is no timeout. 
+BENCHMARK_TIMEOUT=${BENCHMARK_TIMEOUT:-3600} diff --git a/workflows/upf/test/demo-1.json b/workflows/upf/test/demo-1.json new file mode 100644 index 00000000..fa511783 --- /dev/null +++ b/workflows/upf/test/demo-1.json @@ -0,0 +1,3 @@ +{"id": "test1", "epochs": 3, "dense": "200 15"} +{"id": "test2", "epochs": 3, "dense": "200 20"} +{"id": "test3", "epochs": 3, "dense": "200 25"} diff --git a/workflows/upf/test/demo-sweep-1.sh b/workflows/upf/test/demo-sweep-1.sh new file mode 100755 index 00000000..5eab8d82 --- /dev/null +++ b/workflows/upf/test/demo-sweep-1.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -eu + +# DEMO SWEEP 1 + +if (( ${#} != 2 )) +then + echo "usage: test BENCHMARK_NAME SITE" + exit 1 +fi + +export MODEL_NAME=$1 +SITE=$2 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/demo-1.json diff --git a/workflows/upf/test/upf-1.sh b/workflows/upf/test/upf-1.sh index fefbe00d..23e05541 100755 --- a/workflows/upf/test/upf-1.sh +++ b/workflows/upf/test/upf-1.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash set -eu # TEST UPF 1 @@ -21,6 +21,5 @@ export EMEWS_PROJECT_ROOT export OBJ_RETURN="val_loss" CFG_SYS=$THIS/cfg-sys-1.sh -export TURBINE_LAUNCH_OPTIONS="-a1 -g6 -c7" - +export CANDLE_MODEL_TYPE="BENCHMARKS" $EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt diff --git a/workflows/upf/test/upf-1.txt b/workflows/upf/test/upf-1.txt index 36c4667d..919b1d07 100644 --- a/workflows/upf/test/upf-1.txt +++ b/workflows/upf/test/upf-1.txt @@ -1 +1,3 @@ -{"id": "test0", "epochs": 3} +{"id": "RUN000", "epochs": 1} +{"id": "RUN001", "epochs": 2} +{"id": "RUN002", "epochs": 3} diff --git a/workflows/upf/test/upf-gdrp-1.sh b/workflows/upf/test/upf-gdrp-1.sh new file mode 100755 index 00000000..b696d6f1 --- 
/dev/null +++ b/workflows/upf/test/upf-gdrp-1.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# TEST UPF GDRP 1 +# For GraphDRP + +if (( ${#} != 1 )) +then + echo "usage: test SITE" + exit 1 +fi + +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) ; /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. ; /bin/pwd ) +WORKFLOWS_ROOT=$( cd $THIS/../.. ; /bin/pwd ) +export EMEWS_PROJECT_ROOT + +export OBJ_RETURN="val_loss" +CFG_SYS=$THIS/cfg-sys-1.sh + +export CANDLE_IMAGE=/lus/grand/projects/CSC249ADOA01/images/GraphDRP.sif +export CANDLE_MODEL_TYPE="SINGULARITY" +export MODEL_NAME="GraphDRP" + +$EMEWS_PROJECT_ROOT/swift/workflow.sh $SITE -a $CFG_SYS $THIS/upf-1.txt diff --git a/workflows/upf/test/upf-infer-orig.txt b/workflows/upf/test/upf-infer-orig.txt index f5ea51d6..792c12c3 100644 --- a/workflows/upf/test/upf-infer-orig.txt +++ b/workflows/upf/test/upf-infer-orig.txt @@ -1,3 +1,2 @@ {"model_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.1/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.model.h5", "weights_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.1/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.weights.h5", "drug_set": "ALMANAC", "sample_set": "GDSC"} {"model_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.2/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.model.h5", "weights_file": "/global/homes/b/brettin/m2924/brettin/washington/uq.2/save/combo.A=relu.B=32.E=50.O=adam.LR=None.CF=r.DF=d.wu_lr.re_lr.res.D1=1000.D2=1000.D3=1000.D4=1000.weights.h5", "drug_set": "ALMANAC", "sample_set": "GDSC"} - diff --git a/workflows/uq-noise/good-runs.txt b/workflows/uq-noise/good-runs.txt deleted file mode 100644 index 128d4020..00000000 --- a/workflows/uq-noise/good-runs.txt +++ /dev/null @@ -1,8 +0,0 @@ -Dunedin: -X053 : epochs=10 : modifying wrong DF -X056 : epochs=10 : good, 
small run -Theta: -* : wrong DF -X012 : epochs=7 : good data but flat plot -X013 : epochs=10 -X021 : epochs=10 : DONE diff --git a/workflows/uq-noise/scripts/plot-extract-logs.py b/workflows/uq-noise/scripts/plot-extract-logs.py index 990f8159..c4145f55 100755 --- a/workflows/uq-noise/scripts/plot-extract-logs.py +++ b/workflows/uq-noise/scripts/plot-extract-logs.py @@ -1,39 +1,44 @@ #!/usr/bin/env python -import os, sys +import argparse +import os +import sys from pprint import pprint -import argparse parser = argparse.ArgumentParser() -parser.add_argument('output', - help='The workflow output directory ' + - '(input to this script)') -parser.add_argument('obj_return', - help='The key to look for in the model.logs, ' + - 'e.g., val_loss or val_acc') -parser.add_argument('data', - help='The extracted data ' + - '(output from this script)') +parser.add_argument("output", + help="The workflow output directory " + + "(input to this script)") +parser.add_argument( + "obj_return", + help="The key to look for in the model.logs, " + + "e.g., val_loss or val_acc", +) +parser.add_argument("data", + help="The extracted data " + "(output from this script)") # print(sys.argv) args = parser.parse_args(sys.argv[1:]) values = {} + def dict_append(D, key, value): if key not in values.keys(): D[key] = [] D[key].append(value) + def tokenize(line): - results = [ token for token in line.split(" ") - if len(token) > 0 ] + results = [token for token in line.split(" ") if len(token) > 0] return results + def is_final_report(line): - return ("/step" in line) + return "/step" in line + def parse_model_log(f, obj_return): - target = obj_return+":" + target = obj_return + ":" with open(f) as fp: for line in fp: tokens = tokenize(line) @@ -47,14 +52,15 @@ def parse_model_log(f, obj_return): if not is_final_report(line): continue tokens = tokenize(line) - for i in range(0, len(tokens)-1): + for i in range(0, len(tokens) - 1): if tokens[i] == target: - value = float(tokens[i+1]) - break # 1 
level + value = float(tokens[i + 1]) + break # 1 level if value == "NOTFOUND": print("NOTFOUND " + f) return (noise_level, value) + for d in os.walk(args.output): if "model.log" not in d[2]: continue @@ -71,4 +77,4 @@ def parse_model_log(f, obj_return): # print("noise=%i count=%i", noise, count) # print(values[noise]) s = sum(values[noise]) - fp.write("%8.4f %8.4f # count=%i\n" % (noise, s/count, count)) + fp.write("%8.4f %8.4f # count=%i\n" % (noise, s / count, count)) diff --git a/workflows/uq-noise/scripts/plot-extract.py b/workflows/uq-noise/scripts/plot-extract.py index fd14336f..3abc10de 100755 --- a/workflows/uq-noise/scripts/plot-extract.py +++ b/workflows/uq-noise/scripts/plot-extract.py @@ -1,15 +1,16 @@ #!/usr/bin/env python -import os, sys +import os +import sys print(sys.argv) import argparse + parser = argparse.ArgumentParser() -parser.add_argument('output', - help='The workflow output file (input to this script)') -parser.add_argument('data', - help='The extracted data (output from this script)') +parser.add_argument("output", + help="The workflow output file (input to this script)") +parser.add_argument("data", help="The extracted data (output from this script)") args = parser.parse_args(sys.argv[1:]) values = {} @@ -17,8 +18,7 @@ with open(args.output) as fp: for line in fp: tokens = line.split(" ") - if tokens[0] == 'result' and \ - tokens[2] == ":": + if tokens[0] == "result" and tokens[2] == ":": noise = float(tokens[4]) value = float(tokens[6]) if noise not in values.keys(): @@ -32,4 +32,4 @@ for noise in noises: n = len(values[noise]) s = sum(values[noise]) - fp.write("%8.4f %8.4f\n" % (noise, s/n)) + fp.write("%8.4f %8.4f\n" % (noise, s / n)) diff --git a/workflows/uq-noise/swift/workflow-abstention.sh b/workflows/uq-noise/swift/workflow-abstention.sh new file mode 100755 index 00000000..b31c4205 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-abstention.sh @@ -0,0 +1,195 @@ +#! 
/usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! 
-f DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-abstention.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. 
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" 
+ exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/uq-noise/swift/workflow-abstention.swift b/workflows/uq-noise/swift/workflow-abstention.swift new file mode 100644 index 00000000..676aec61 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-abstention.swift @@ -0,0 +1,62 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.05; // Difference between noises +int num_trials = 2; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 50; +float feature_threshold = 0.01; +string add_noise = "false"; +string noise_correlated = "true"; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.2f-%01i" % (std_dev, k); + params = ("{ \"label_noise\" : %f , " + + " \"max_abs\" : %f, " + + " \"noise_correlated\" : %s, " + + " \"feature_col\" : %i, " + + " \"feature_threshold\" : %f, " + + " \"epochs\" : 100 } ") % + (std_dev, std_dev, noise_correlated, feature_col, feature_threshold); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.sh b/workflows/uq-noise/swift/workflow-gauss-abs.sh new file mode 100755 
index 00000000..9d1b51df --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gauss-abs.sh @@ -0,0 +1,195 @@ +#! /usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! { + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! 
-f $DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-obj_abstention_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model_abstention.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-gauss-abs.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion." +fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory.
This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-gauss-abs.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 | \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!"
+ exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/uq-noise/swift/workflow-gauss-abs.swift b/workflows/uq-noise/swift/workflow-gauss-abs.swift new file mode 100644 index 00000000..a1fbb973 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gauss-abs.swift @@ -0,0 +1,71 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.025; // Difference between noises +int num_trials = 1; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 50; +float feature_threshold = 0.01; +string add_noise = "false"; +string noise_correlated = "false"; +string gaussian_noise = "true"; + + +float abs_vals[] = [0.01964286183, 0.01785714711, 0.01785714711, 0.02500000596, 0.02500000596, 0.03035715009, 0.03392857526, 0.03392857526, 0.05892858122, 0.05714286438, 0.08928572493, 0.1000000047, 0.1053571467, 0.1821428537, 0.1732142823, 0.2124999974, 0.2339285719, 0.1982142861, 0.3696428559, 0.2250000026, 0.2999999991]; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.2f-%01i" % (std_dev, k); + + max_abs = abs_vals[i]; + + params = ("{ \"label_noise\" : %f , " + + " \"max_abs\" : %f, " + + " \"std_dev\" : %f, " + + " \"gaussian_noise\" : %s, " + + " \"noise_correlated\" : 
%s, " + + " \"feature_col\" : %i, " + + " \"feature_threshold\" : %f, " + + " \"epochs\" : 100 } ") % + (std_dev, max_abs, std_dev, gaussian_noise, noise_correlated, feature_col, feature_threshold); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} diff --git a/workflows/uq-noise/swift/workflow-gnoise.sh b/workflows/uq-noise/swift/workflow-gnoise.sh new file mode 100755 index 00000000..98d173ff --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gnoise.sh @@ -0,0 +1,195 @@ +#! /usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! 
{ + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! -f $DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-gnoise.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion."
+fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-gnoise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!"
+ exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/uq-noise/swift/workflow-gnoise.swift b/workflows/uq-noise/swift/workflow-gnoise.swift new file mode 100644 index 00000000..6ee4bf11 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-gnoise.swift @@ -0,0 +1,64 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float std_dev_step = 0.025; // Difference between noises +int num_trials = 1; + +float num_std_dev_noise= 20; // Number of noise levels to try + +float std_dev_array[] = [0:num_std_dev_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 50; +float feature_threshold = 0.02; +string add_noise = "false"; +string gaussian_noise = "true"; +string noise_correlated = "false"; + +foreach level, i in std_dev_array +{ + foreach trial, k in trials + { + std_dev = level * std_dev_step; + run_id = "%0.3f-%01i" % (std_dev, k); + params = ("{ \"std_dev\" : %f , " + + " \"add_noise\" : %s, " + + " \"gaussian_noise\" : %s, " + + " \"noise_correlated\" : %s, " + + " \"feature_threshold\" : %f, " + + " \"feature_col\" : %i, " + + " \"epochs\" : 200 } ") % + (std_dev, add_noise, gaussian_noise, noise_correlated, feature_threshold, feature_col); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : std_dev %0.2f : %s", + run_id, std_dev, result); + } +} diff --git a/workflows/uq-noise/swift/workflow-noise.sh 
b/workflows/uq-noise/swift/workflow-noise.sh new file mode 100755 index 00000000..ea50e09c --- /dev/null +++ b/workflows/uq-noise/swift/workflow-noise.sh @@ -0,0 +1,195 @@ +#! /usr/bin/env bash +set -eu + +# UQ NOISE WORKFLOW +# Main entry point for UQ-NOISE workflow +# See README.adoc for more information + +# Autodetect this workflow directory +export EMEWS_PROJECT_ROOT=$( cd $( dirname $0 )/.. ; /bin/pwd ) +export WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. ; /bin/pwd ) +if [[ ! -d $EMEWS_PROJECT_ROOT/../../../Benchmarks ]] +then + echo "Could not find Benchmarks in: $EMEWS_PROJECT_ROOT/../../../Benchmarks" + exit 1 +fi +BENCHMARKS_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../../../Benchmarks ; /bin/pwd) +export BENCHMARKS_ROOT=${BENCHMARKS_ROOT:-${BENCHMARKS_DEFAULT}} +BENCHMARKS_DIR_BASE=$BENCHMARKS_ROOT/Pilot1/NT3 +export BENCHMARK_TIMEOUT +export BENCHMARK_DIR=${BENCHMARK_DIR:-$BENCHMARKS_DIR_BASE} + +XCORR_DEFAULT=$( cd $EMEWS_PROJECT_ROOT/../xcorr ; /bin/pwd) +export XCORR_ROOT=${XCORR_ROOT:-$XCORR_DEFAULT} + +SCRIPT_NAME=$(basename $0) + +# Source some utility functions used by EMEWS in this script +source $WORKFLOWS_ROOT/common/sh/utils.sh + +usage() +{ + echo "workflow.sh: usage: workflow.sh SITE EXPID CFG_SYS CFG_PRM MODEL_NAME" +} + +if (( ${#} != 5 )) +then + usage + exit 1 +fi + +if ! 
{ + get_site $1 # Sets SITE + get_expid $2 # Sets EXPID + get_cfg_sys $3 + get_cfg_prm $4 + MODEL_NAME=$5 + } +then + usage + exit 1 +fi + +echo "Running "$MODEL_NAME "workflow" + +source_site env $SITE +source_site sched $SITE + +# Set PYTHONPATH for BENCHMARK related stuff +PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs + +export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT + +export TURBINE_JOBNAME="JOB:${EXPID}" + +if [ -z ${GPU_STRING+x} ]; +then + GPU_ARG="" +else + GPU_ARG="-gpus=$GPU_STRING" +fi + +mkdir -pv $TURBINE_OUTPUT + +DB_FILE=$TURBINE_OUTPUT/uq-noise.db +if [[ ! -f $DB_FILE ]] +then + if [[ ${UQ_NOISE_ID:-} == "" ]] + then + if [[ ${EXPID:0:1} == "X" ]] + then + UQ_NOISE_ID=${EXPID:1} + else + UQ_NOISE_ID=$EXPID + fi + fi + # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID +fi + +CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT + -exp_id=$EXPID + -site=$SITE + -db_file=$DB_FILE + $GPU_ARG + -cache_dir=$CACHE_DIR + ) + +USER_VARS=( $CMD_LINE_ARGS ) +# log variables and script to TURBINE_OUTPUT directory +log_script + +# Make run directory in advance to reduce contention +mkdir -pv $TURBINE_OUTPUT/run +mkdir -pv $TURBINE_OUTPUT/data +mkdir -pv $CACHE_DIR +mkdir -pv $XCORR_DATA_DIR +mkdir -pv $TURBINE_OUTPUT/hpo_log + +# Allow the user to set an objective function +OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} +# This is used by the obj_app objective function +export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh + +# log_path PYTHONPATH + +WORKFLOW_SWIFT=${WORKFLOW_SWIFT:-workflow-noise.swift} +echo "WORKFLOW_SWIFT: $WORKFLOW_SWIFT" + +WAIT_ARG="" +if (( ${WAIT:-0} )) +then + WAIT_ARG="-t w" + echo "Turbine will wait for job completion."
+fi + +if [[ ${MACHINE:-} == "" ]] +then + STDOUT=$TURBINE_OUTPUT/output.txt + # The turbine-output link is only created on scheduled systems, + # so if running locally, we create it here so the test*.sh wrappers + # can find it + [[ -L turbine-output ]] && rm turbine-output + ln -s $TURBINE_OUTPUT turbine-output +else + # When running on a scheduled system, Swift/T automatically redirects + # stdout to the turbine-output directory. This will just be for + # warnings or unusual messages + # use for summit (slurm needs two %) + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" + + #export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" + mkdir -pv $TURBINE_OUTPUT/out + STDOUT="" +fi + +#echo ${CMD_LINE_ARGS[@]} + +cd $TURBINE_OUTPUT +cp $CFG_SYS $CFG_PRM $WORKFLOWS_ROOT/uq-noise/swift/workflow-noise.swift $TURBINE_OUTPUT + +if [[ ${SITE} == "summit" ]] +then + export TURBINE_LAUNCH_OPTIONS="-r6 -a1 -g1 -c7" +fi +TURBINE_RESIDENT_WORK_WORKERS=1 + +swift-t -n $PROCS \ + ${MACHINE:-} \ + -p \ + -I $OBJ_DIR \ + -i $OBJ_MODULE \ + -e LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-} \ + -e TURBINE_RESIDENT_WORK_WORKERS=$TURBINE_RESIDENT_WORK_WORKERS \ + -e TURBINE_STDOUT \ + -e RESIDENT_WORK_RANKS=$RESIDENT_WORK_RANKS \ + -e BENCHMARKS_ROOT \ + -e EMEWS_PROJECT_ROOT \ + -e XCORR_ROOT \ + -e APP_PYTHONPATH=$APP_PYTHONPATH \ + $( python_envs ) \ + -e TURBINE_OUTPUT=$TURBINE_OUTPUT \ + -e OBJ_RETURN \ + -e MODEL_PYTHON_SCRIPT=${MODEL_PYTHON_SCRIPT:-} \ + -e MODEL_PYTHON_DIR=${MODEL_PYTHON_DIR:-} \ + -e MODEL_SH \ + -e MODEL_NAME \ + -e SITE \ + -e BENCHMARK_TIMEOUT \ + -e BENCHMARKS_ROOT \ + -e SH_TIMEOUT \ + -e IGNORE_ERRORS \ + $WAIT_ARG \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + tee $STDOUT + + +if (( ${PIPESTATUS[0]} )) +then + echo "workflow.sh: swift-t exited with error!" 
+ exit 1 +fi + +# echo "EXIT CODE: 0" | tee -a $STDOUT diff --git a/workflows/uq-noise/swift/workflow-noise.swift b/workflows/uq-noise/swift/workflow-noise.swift new file mode 100644 index 00000000..52b9dad0 --- /dev/null +++ b/workflows/uq-noise/swift/workflow-noise.swift @@ -0,0 +1,62 @@ +/* + UQ NOISE SWIFT + Main workflow +*/ + +import assert; +import files; +import io; +import python; +import unix; +import sys; +import string; +import location; +import math; +import json; + +string FRAMEWORK = "keras"; + +string xcorr_root = getenv("XCORR_ROOT"); +string preprocess_rnaseq = getenv("PREPROP_RNASEQ"); +string emews_root = getenv("EMEWS_PROJECT_ROOT"); +string turbine_output = getenv("TURBINE_OUTPUT"); + +string exp_id = argv("exp_id"); +int benchmark_timeout = toint(argv("benchmark_timeout", "-1")); +string model_name = getenv("MODEL_NAME"); + +printf("UQ NOISE WORKFLOW.SWIFT"); +printf("TURBINE_OUTPUT: " + turbine_output); + +float noise_step = 5.0; // Difference between noises +int num_trials = 1; + +float num_label_noise= 20; // Number of noise levels to try + +float label_noise_array[] = [0:num_label_noise]; +int trials[] = [0:num_trials-1]; + +int feature_col = 11180; +float feature_threshold = 0.01; +string add_noise = "true"; +string noise_correlated = "true"; + +foreach level, i in label_noise_array +{ + foreach trial, k in trials + { + label_noise = level * noise_step/100; + run_id = "%0.2f-%01i" % (label_noise, k); + params = ("{ \"label_noise\" : %f , " + + " \"add_noise\" : %s, " + + " \"noise_correlated\" : %s, " + + " \"feature_threshold\" : %f, " + + " \"feature_col\" : %i, " + + " \"epochs\" : 200 } ") % + (label_noise, add_noise, noise_correlated, feature_threshold, feature_col); + printf("running: %s", params); + result = obj(params, run_id); + printf("result %s : label_noise %0.2f : %s", + run_id, label_noise, result); + } +} diff --git a/workflows/uq-noise/swift/workflow.sh b/workflows/uq-noise/swift/workflow.sh index 7c82bdaa..926ea292 
100755 --- a/workflows/uq-noise/swift/workflow.sh +++ b/workflows/uq-noise/swift/workflow.sh @@ -56,7 +56,7 @@ source_site env $SITE source_site sched $SITE # Set PYTHONPATH for BENCHMARK related stuff -PYTHONPATH+=:$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT +PYTHONPATH+=:$BENCHMARK_DIR:$XCORR_ROOT PYTHONPATH+=:$WORKFLOWS_ROOT/common/python # needed for model_runner and logs export APP_PYTHONPATH=$BENCHMARK_DIR:$BENCHMARKS_ROOT/common:$XCORR_ROOT @@ -87,6 +87,12 @@ then # $EMEWS_PROJECT_ROOT/db/db-cplo-init $DB_FILE $UQ_NOISE_ID fi + + +# Set up PYTHONPATH for model +source $WORKFLOWS_ROOT/common/sh/set-pythonpath.sh + + CMD_LINE_ARGS=( -benchmark_timeout=$BENCHMARK_TIMEOUT -exp_id=$EXPID -site=$SITE @@ -108,7 +114,8 @@ mkdir -pv $TURBINE_OUTPUT/hpo_log # Allow the user to set an objective function OBJ_DIR=${OBJ_DIR:-$WORKFLOWS_ROOT/common/swift} -OBJ_MODULE=${OBJ_MODULE:-obj_$SWIFT_IMPL} +CANDLE_MODEL_IMPL="py" +OBJ_MODULE=${OBJ_MODULE:-model_$CANDLE_MODEL_IMPL} # This is used by the obj_app objective function export MODEL_SH=$WORKFLOWS_ROOT/common/sh/model.sh @@ -124,6 +131,18 @@ then echo "Turbine will wait for job completion." 
fi +# Handle %-escapes in TURBINE_STDOUT +if [ $SITE == "summit" ] || \ + [ $SITE == "biowulf" ] || \ + [ $SITE == "polaris" ] +then + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%%r.txt" +else + export TURBINE_STDOUT="$TURBINE_OUTPUT/out/out-%r.txt" +fi + +mkdir -pv $TURBINE_OUTPUT/out + if [[ ${MACHINE:-} == "" ]] then STDOUT=$TURBINE_OUTPUT/output.txt @@ -154,6 +173,10 @@ then export TURBINE_LAUNCH_OPTIONS="-g6 -c42 -a1 -b packed:42" fi + +export TURBINE_DIRECTIVE="#BSUB -q batch-hm" +TURBINE_RESIDENT_WORK_WORKERS=1 + swift-t -n $PROCS \ ${MACHINE:-} \ -p \ @@ -180,7 +203,7 @@ swift-t -n $PROCS \ -e SH_TIMEOUT \ -e IGNORE_ERRORS \ $WAIT_ARG \ - $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} |& \ + $EMEWS_PROJECT_ROOT/swift/$WORKFLOW_SWIFT ${CMD_LINE_ARGS[@]} 2>&1 | \ tee $STDOUT @@ -191,4 +214,3 @@ then fi # echo "EXIT CODE: 0" | tee -a $STDOUT - diff --git a/workflows/uq-noise/swift/workflow.swift b/workflows/uq-noise/swift/workflow.swift index 9c89f57c..461c8de9 100644 --- a/workflows/uq-noise/swift/workflow.swift +++ b/workflows/uq-noise/swift/workflow.swift @@ -52,10 +52,9 @@ foreach levelx, i in x_noise_levels " \"epochs\" : 1 } ") % (x_noise_level, y_noise_level); printf("running: %s", params); - result = obj(params, run_id); + result = obj(params, exp_id, run_id); printf("result %s : x_noise %0.3f y_noise %0.3f : %s", run_id, x_noise_level, y_noise_level, result); } } } - diff --git a/workflows/uq-noise/swift/xy_workflow.swift b/workflows/uq-noise/swift/xy_workflow.swift index 929feeb7..56c8ea4c 100644 --- a/workflows/uq-noise/swift/xy_workflow.swift +++ b/workflows/uq-noise/swift/xy_workflow.swift @@ -53,4 +53,3 @@ foreach levelx, i in x_noise_levels } } } - diff --git a/workflows/uq-noise/test/abstention.sh b/workflows/uq-noise/test/abstention.sh new file mode 100755 index 00000000..45259f54 --- /dev/null +++ b/workflows/uq-noise/test/abstention.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage:
test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. && /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-abstention.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/uq-noise/test/cfg-sys-1.sh b/workflows/uq-noise/test/cfg-sys-1.sh index d1c515a5..932d8daa 100644 --- a/workflows/uq-noise/test/cfg-sys-1.sh +++ b/workflows/uq-noise/test/cfg-sys-1.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-2} +export PROCS=${PROCS:-3} # export PROCS=${PROCS:-128} # MPI processes per node @@ -12,11 +12,13 @@ export PROCS=${PROCS:-2} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-cache-quad} +#export QUEUE=${QUEUE:-batch-hm} # export QUEUE=R.candle export WALLTIME=${WALLTIME:-02:00:00} +RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + # command separated list of 
gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" diff --git a/workflows/uq-noise/test/cfg-sys-small.sh b/workflows/uq-noise/test/cfg-sys-small.sh index e009c597..9e63a85e 100644 --- a/workflows/uq-noise/test/cfg-sys-small.sh +++ b/workflows/uq-noise/test/cfg-sys-small.sh @@ -4,7 +4,7 @@ # The number of MPI processes # Note that 2 processes are reserved for Swift/EMEMS # The default of 4 gives you 2 workers, i.e., 2 concurrent Keras runs -export PROCS=${PROCS:-2} +export PROCS=${PROCS:-3} # export PROCS=${PROCS:-128} # MPI processes per node @@ -12,11 +12,14 @@ export PROCS=${PROCS:-2} export PPN=${PPN:-1} # For Theta: -export QUEUE=${QUEUE:-debug-cache-quad} +# export QUEUE=${QUEUE:-debug-cache-quad} # export QUEUE=R.candle export WALLTIME=${WALLTIME:-00:15:00} +RESIDENT_WORK_RANKS=$(( PROCS - 2 )) + + # command separated list of gpu ids # export GPU_STRING=${GPU_STRING:-0} #export TURBINE_LAUNCH_OPTIONS="-a6 -g6 -c42" diff --git a/workflows/uq-noise/test/gauss-abs.sh b/workflows/uq-noise/test/gauss-abs.sh new file mode 100755 index 00000000..7a0fa824 --- /dev/null +++ b/workflows/uq-noise/test/gauss-abs.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" +export CANDLE_MODEL_TYPE="BENCHMARKS" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-gauss-abs.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/uq-noise/test/gnoise.sh b/workflows/uq-noise/test/gnoise.sh new file mode 100755 index 00000000..88fe90c7 --- /dev/null +++ b/workflows/uq-noise/test/gnoise.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-gnoise.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/uq-noise/test/test-1.sh b/workflows/uq-noise/test/test-1.sh index eed7152d..9dd15f9d 100755 --- a/workflows/uq-noise/test/test-1.sh +++ b/workflows/uq-noise/test/test-1.sh @@ -40,6 +40,8 @@ export CFG_PRM=$THIS/cfg-prm-1.sh # val_loss (default) and val_corr are supported export OBJ_RETURN="val_loss" +export CANDLE_MODEL_TYPE="BENCHMARKS" + if [[ $SITE == "theta" ]] then export WAIT=1 diff --git a/workflows/uq-noise/test/test-noise.sh b/workflows/uq-noise/test/test-noise.sh new file mode 100755 index 00000000..6513526f --- /dev/null +++ b/workflows/uq-noise/test/test-noise.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eu + +# UQ NOISE TEST 1 + +usage() +{ + echo "Usage: test SITE RUN_DIR(optional)" + echo " RUN_DIR is optional, use -a for automatic" +} + +RUN_DIR="" +if (( ${#} == 2 )) +then + RUN_DIR=$2 +elif (( ${#} == 1 )) # test-all uses this +then + RUN_DIR="-a" +else + usage + exit 1 +fi + +export MODEL_NAME=nt3 +SITE=$1 + +# Self-configure +THIS=$( cd $( dirname $0 ) && /bin/pwd ) +EMEWS_PROJECT_ROOT=$( cd $THIS/.. && /bin/pwd ) +export EMEWS_PROJECT_ROOT +WORKFLOWS_ROOT=$( cd $EMEWS_PROJECT_ROOT/.. 
&& /bin/pwd ) +source $WORKFLOWS_ROOT/common/sh/utils.sh + +# Select configurations +export CFG_SYS=$THIS/cfg-sys-1.sh +# export CFG_SYS=$THIS/cfg-sys-big.sh +export CFG_PRM=$THIS/cfg-prm-1.sh + +# What to return from the objective function (Keras model) +# val_loss (default) and val_corr are supported +export OBJ_RETURN="val_loss" + +if [[ $SITE == "theta" ]] +then + export WAIT=1 +fi + +# Submit job +$EMEWS_PROJECT_ROOT/swift/workflow-noise.sh $SITE $RUN_DIR $CFG_SYS $CFG_PRM $MODEL_NAME + +# Wait for job +TURBINE_OUTPUT=$( readlink turbine-output ) +queue_wait + +# Check job output +OUTPUT=$TURBINE_OUTPUT/output.txt +WORKFLOW=$( basename $EMEWS_PROJECT_ROOT ) + +SCRIPT=$( basename $0 .sh ) + +echo "$SCRIPT: SUCCESS" + +# Local Variables: +# c-basic-offset: 2; +# End: diff --git a/workflows/xcorr/CandleFeatureSelectionFunction.py b/workflows/xcorr/CandleFeatureSelectionFunction.py index f3ae13e6..bcfc8216 100644 --- a/workflows/xcorr/CandleFeatureSelectionFunction.py +++ b/workflows/xcorr/CandleFeatureSelectionFunction.py @@ -2,7 +2,6 @@ import pandas as pd - # Use cross-correlation to select the features that are generalizable between data1 and data2. # data1: an array, where rows are samples and columns are features # data2: an array, where rows are samples and columns are features. data1 and data2 should have an equal @@ -17,8 +16,12 @@ def crossCorrelation_FS(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] @@ -27,7 +30,6 @@ def crossCorrelation_FS(data1, data2, cutoff): return sorted(fid) - # Use COXEN approach to select predictive and generalizable genes for prediction. 
# study1: the name of study 1, should be one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' # study2: the name of study 2, should be one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' @@ -41,26 +43,41 @@ def crossCorrelation_FS(data1, data2, cutoff): # whose cross-correlation coefficient >= cutoffCrossCorrelation are selected. If cutoffCrossCorrelation >= 1, # it must be an integer indicating the number of genes to be selected based on cross-correlation coefficient. -def COXEN_FeatureSelection(study1, study2, rnaSeqData, drugResponseData, cutoffCorrelation=200, cutoffCrossCorrelation=100): + +def COXEN_FeatureSelection( + study1, + study2, + rnaSeqData, + drugResponseData, + cutoffCorrelation=200, + cutoffCrossCorrelation=100, +): # get rnaSeq data of study1 and study2 - study = np.array([i.split('.')[0] for i in rnaSeqData.index]) + study = np.array([i.split(".")[0] for i in rnaSeqData.index]) data1 = rnaSeqData.iloc[np.where(study == study1)[0], :] data2 = rnaSeqData.iloc[np.where(study == study2)[0], :] # keep only drug response data of cell lines in data1 - drugResponseData = drugResponseData.iloc[np.where(drugResponseData.SOURCE == study1)[0], :] - drugResponseData = drugResponseData.iloc[np.where(np.isin(drugResponseData.CELLNAME, data1.index))[0], :] + drugResponseData = drugResponseData.iloc[np.where( + drugResponseData.SOURCE == study1)[0], :] + drugResponseData = drugResponseData.iloc[ + np.where(np.isin(drugResponseData.CELLNAME, data1.index))[0], :] # perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, # the prediction power of genes (i.e. absolute correlation coefficient with drug response) is calculated in batches. 
batchSize = 100 - numBatch = int(np.ceil(data1.shape[1]/batchSize)) + numBatch = int(np.ceil(data1.shape[1] / batchSize)) cor = np.empty((data1.shape[1], 1)) for i in range(numBatch): - startIndex = i*batchSize - endIndex = min((i+1)*batchSize, data1.shape[1]) - cor_i = np.corrcoef(np.vstack((np.transpose(data1.iloc[:, startIndex:endIndex].loc[drugResponseData.CELLNAME, - :].values), np.reshape(drugResponseData.GROWTH.values, (1, drugResponseData.shape[0]))))) + startIndex = i * batchSize + endIndex = min((i + 1) * batchSize, data1.shape[1]) + cor_i = np.corrcoef( + np.vstack(( + np.transpose(data1.iloc[:, startIndex:endIndex].loc[ + drugResponseData.CELLNAME, :].values), + np.reshape(drugResponseData.GROWTH.values, + (1, drugResponseData.shape[0])), + ))) cor[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) if cutoffCorrelation < 1: gid1 = np.where(cor >= cutoffCorrelation)[0] @@ -72,22 +89,41 @@ def COXEN_FeatureSelection(study1, study2, rnaSeqData, drugResponseData, cutoffC data2 = data2.iloc[:, gid1] # perform the second step of COXEN approach to select generalizable genes among the predictive genes - gid2 = crossCorrelation_FS(data1.values, data2.values, cutoffCrossCorrelation) + gid2 = crossCorrelation_FS(data1.values, data2.values, + cutoffCrossCorrelation) # return the gene names return data1.columns[gid2] - # Load data. 
-rnaSeqData = pd.read_csv('/home/nick/Documents/repos/Benchmarks/Data/Pilot1/combined_rnaseq_data_lincs1000_combat', sep='\t', engine='c', na_values=['na', '-', ''], - header=0, index_col=0) -drugResponseData = pd.read_csv('/home/nick/Documents/repos/Benchmarks/Data/Pilot1/rescaled_combined_single_drug_growth', sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=None) +rnaSeqData = pd.read_csv( + "/home/nick/Documents/repos/Benchmarks/Data/Pilot1/combined_rnaseq_data_lincs1000_combat", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=0, +) +drugResponseData = pd.read_csv( + "/home/nick/Documents/repos/Benchmarks/Data/Pilot1/rescaled_combined_single_drug_growth", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=None, +) # Sample selection and filtering should be done here by selecting a part of drugResponseData or a part of rnaSeqData. # The following line of code is just a example randomly selecting 10000 samples through subsetting drugResponseData. 
-drugResponseData = drugResponseData.iloc[np.random.permutation(drugResponseData.shape[0])[:10000], :] - -selectedGenes = COXEN_FeatureSelection(study1='CTRP', study2='CCLE', rnaSeqData=rnaSeqData, - drugResponseData=drugResponseData, cutoffCorrelation=100, cutoffCrossCorrelation=50) +drugResponseData = drugResponseData.iloc[ + np.random.permutation(drugResponseData.shape[0])[:10000], :] + +selectedGenes = COXEN_FeatureSelection( + study1="CTRP", + study2="CCLE", + rnaSeqData=rnaSeqData, + drugResponseData=drugResponseData, + cutoffCorrelation=100, + cutoffCrossCorrelation=50, +) diff --git a/workflows/xcorr/CandlePilotWorkflow.py b/workflows/xcorr/CandlePilotWorkflow.py index cedc9175..6ccbbe96 100644 --- a/workflows/xcorr/CandlePilotWorkflow.py +++ b/workflows/xcorr/CandlePilotWorkflow.py @@ -1,9 +1,9 @@ import sys + import numpy as np from scipy.stats import ttest_ind - # Could be used for NT3 (f(row) -> binary) # Use t-test to select features that are discriminative between two sample classes # data: an array, where rows are samples and columns are features (e.g., RNA expression row) @@ -16,12 +16,14 @@ def ttest_FS(data, label, cutoff): unique_label = list(set(label)) if len(unique_label) != 2: - print('T-test feature selection needs two sample classes') + print("T-test feature selection needs two sample classes") return None id0 = np.where(label == unique_label[0])[0] id1 = np.where(label == unique_label[1])[0] if len(id0) < 3 or len(id1) < 3: - print('T-test feature selection requires every sample class has at least 3 samples') + print( + "T-test feature selection requires every sample class has at least 3 samples" + ) return None t, p = ttest_ind(a=data[id0, :], b=data[id1, :], axis=0, equal_var=False) if cutoff < 1: @@ -41,16 +43,17 @@ def ttest_FS(data, label, cutoff): # integer indicating the number of features to be selected based on absolute correlation coefficient. # Returns a list of indices of the selected features. 
def correlation_FS(data, target, cutoff): - cor = np.corrcoef(np.vstack((np.transpose(data), np.reshape(target, (1, len(target)))))) + cor = np.corrcoef( + np.vstack((np.transpose(data), np.reshape(target, (1, len(target)))))) cor = abs(cor[:-1, -1]) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: - fid = sorted(range(len(cor)), key=lambda x: cor[x], reverse=True)[:int(cutoff)] + fid = sorted(range(len(cor)), key=lambda x: cor[x], + reverse=True)[:int(cutoff)] return sorted(fid) - # Use the COXEN approach to select the features that are generalizable between data1 and data2. # data1: an array, where rows are samples and columns are features # data2: an array, where rows are samples and columns are features. data1 and data2 should have an equal @@ -65,32 +68,34 @@ def COXEN_FS(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: - fid = sorted(range(num), key=lambda x: cor[x], reverse=True)[:int(cutoff)] + fid = sorted(range(num), key=lambda x: cor[x], + reverse=True)[:int(cutoff)] return sorted(fid) - - - -numF = 10 # Number of features -numS = 50 # Number of samples to be multiplied by 2 +numF = 10 # Number of features +numS = 50 # Number of samples to be multiplied by 2 data1 = np.random.randn(numF, numS) for i in range(numF): - data1[i, :] = data1[i, :] + i/5 + data1[i, :] = data1[i, :] + i / 5 data2 = np.random.randn(numF, numS) data1 = np.hstack((data1, data2)) data1 = np.transpose(data1) label = np.array([0 for i in range(numS)] + [1 for i in range(numS)]) -data3 = np.random.randn(numF, int(numS/2)) +data3 = np.random.randn(numF, int(numS / 2)) for i in range(numF): - data3[i, 
:] = data3[i, :] + i/5 -data4 = np.random.randn(numF, int(numS/2)) + data3[i, :] = data3[i, :] + i / 5 +data4 = np.random.randn(numF, int(numS / 2)) data3 = np.hstack((data3, data4)) data3 = np.transpose(data3) diff --git a/workflows/xcorr/README.adoc b/workflows/xcorr/README.adoc index 11435d97..ea9f3bf8 100644 --- a/workflows/xcorr/README.adoc +++ b/workflows/xcorr/README.adoc @@ -14,14 +14,14 @@ COXEN approach takes the following steps. between the gene’s expression and the drug response value. Select _n_~1~ genes whose absolute correlation coefficients with drug response are the highest. -. For each of the _n_~1~ genes, do the following: +. For each of the _n_~1~ genes, do the following: .. Calculate its Pearson correlation coefficients with the other _n_~1~-1 genes based on their expression values in dataset 1, which forms a _n_~1~-1-dimensional -vector of Pearson correlation coefficients denoted by _c_~1~. -.. Calculate its Pearson correlation coefficients with the +vector of Pearson correlation coefficients denoted by _c_~1~. +.. Calculate its Pearson correlation coefficients with the . Among the _n_~1~ genes, select _n_~2~ genes whose COXEN scores are the -highest. +highest. With respect to using the results in to train a model, drug response prediction model would be trained using these _n_~2~ genes using @@ -47,11 +47,11 @@ the prediction model. The COXEN algorithm requires two input parameters _n_~1~ a _n_~2~, which are the number of candidate predictive genes and the number of selected genes in final output. These two parameters can be pre-determined before data analysis or tuned through hyperparameter search for identifying -their optimal values to build the prediction model. +their optimal values to build the prediction model. == Code -The COXEN implementation consists of two files: `xcorr.py` and `uno_xcorr.py`. +The COXEN implementation consists of two files: `xcorr.py` and `uno_xcorr.py`. 
* `xcorr.py` - implements COXEN correlation using numpy arrays to represent the datasets. This code encapsulates steps 1 and 2 in a generic way. See the @@ -59,8 +59,8 @@ documentation comments in each python function for more details. * `uno_xcorr.py` - runs the COXEN correlation code in `xcorr.py` on Pilot 1 gene and drug reponse data to produce cross correlated features files that -can be used with the Uno benchmark model. The module needs to be initialized -with gene and drug repsonse data via call to `uno_xcorr.init_uno_xcorr` before +can be used with the Uno benchmark model. The module needs to be initialized +with gene and drug repsonse data via call to `uno_xcorr.init_uno_xcorr` before running the cross correlation. For example, + @@ -79,17 +79,17 @@ a training run. For example, + ---- -uno_xcorr.coxen_feature_selection('CCLE', 'GDSC', 2000, 1000, +uno_xcorr.coxen_feature_selection('CCLE', 'GDSC', 2000, 1000, 'CCLE_GDSC_2000_1000_features.txt') ---- + where 'CCLE' and 'GDSC' are the names of cancer studies in the initialization data each with gene / drug treatment and response values. The call produces -a cross correlation file of the cell features of these two studies using +a cross correlation file of the cell features of these two studies using a correlation cutoff of 2000 (limiting __c_~1~_ in step 2.a above to those values >= 2000), and a cross -correlation cutoff of 1000 (limiting the results of step 2.c above to those +correlation cutoff of 1000 (limiting the results of step 2.c above to those >= 1000). 
diff --git a/workflows/xcorr/db-init.py b/workflows/xcorr/db-init.py index e88310cd..bc6fdbde 100644 --- a/workflows/xcorr/db-init.py +++ b/workflows/xcorr/db-init.py @@ -1,71 +1,73 @@ - # DB INIT PY # Initialize the SQLite DB # See db-init.sql for the table schema import sys -from xcorr_db import xcorr_db, q - from pathlib import Path + +from xcorr_db import q, xcorr_db + THIS = Path(sys.argv[0]).parent.resolve() -DB = xcorr_db('xcorr.db') +DB = xcorr_db("xcorr.db") DB.connect() + def create_tables(): - """ Set up the tables defined in the SQL file """ + """Set up the tables defined in the SQL file.""" global THIS - with open(str(THIS)+"/db-init.sql") as fp: + with open(str(THIS) + "/db-init.sql") as fp: sqlcode = fp.read() DB.executescript(sqlcode) DB.commit() + def insert_feature_names(): - """ - Copy features from the header of this datafile - into the features table - """ + """Copy features from the header of this datafile into the features + table.""" global THIS - datafile = str(THIS)+"/test_data/combined_rnaseq_data_lincs1000_combat" - #datafile = "test_data/combined_rnaseq_data_combat" + datafile = str(THIS) + "/test_data/combined_rnaseq_data_lincs1000_combat" + # datafile = "test_data/combined_rnaseq_data_combat" with open(datafile) as fp: line = fp.readline() feature_names = line.split("\t") - del feature_names[0] # Remove first token "Sample" + del feature_names[0] # Remove first token "Sample" for name in feature_names: - if name == "": continue + if name == "": + continue name = name.strip() - DB.insert(table="feature_names", - names=["name"], - values=[q(name)]) + DB.insert(table="feature_names", names=["name"], values=[q(name)]) + def insert_study_names(): - """ Copy study names from studies.txt into the DB """ + """Copy study names from studies.txt into the DB.""" global THIS studies = [] - with open(str(THIS)+"/studies.txt") as fp: + with open(str(THIS) + "/studies.txt") as fp: while True: line = fp.readline() - if line == "": break + if line == 
"": + break tokens = line.split("#") line = tokens[0] line = line.strip() - if line == "": continue + if line == "": + continue studies.append(line) for study in studies: - DB.insert(table="study_names", - names=["name"], - values=[q(study)]) + DB.insert(table="study_names", names=["name"], values=[q(study)]) + def create_indices(): - """ Create indices after data insertion for speed """ + """Create indices after data insertion for speed.""" DB.execute("create index features_index on features(record_id);") DB.execute("create index studies_index on studies ( study_id);") + # Catch and print all exceptions to improve visibility of success/failure success = False try: @@ -76,11 +78,13 @@ def create_indices(): success = True except Exception as e: import traceback + print(traceback.format_exc()) if not success: print("DB: !!! INIT FAILED !!!") import sys + sys.exit(1) print("DB: initialized successfully") diff --git a/workflows/xcorr/db-insert-junk.py b/workflows/xcorr/db-insert-junk.py index b701206c..9b1e54a2 100644 --- a/workflows/xcorr/db-insert-junk.py +++ b/workflows/xcorr/db-insert-junk.py @@ -1,4 +1,3 @@ - # DB INSERT JUNK PY # Test SQLite DB # See init-db.sql for the table schema @@ -7,23 +6,22 @@ import random import time -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db -DB = xcorr_db('xcorr.db') +DB = xcorr_db("xcorr.db") feature_id2name, feature_name2id = DB.read_feature_names() -study_id2name, study_name2id = DB.read_study_names() +study_id2name, study_name2id = DB.read_study_names() feature_names = feature_name2id.keys() -study_names = study_name2id .keys() +study_names = study_name2id.keys() -for i in range(1,4): +for i in range(1, 4): cutoff_corr = 200 cutoff_xcorr = 50 - features = [ feature for feature in feature_names - if random.randint(0,300) == 0 ] - studies = [ study for study in study_names - if random.randint(0,1) == 0 ] - record = ( features, cutoff_corr, cutoff_xcorr ) - DB.insert_xcorr_record(studies, features, - 
cutoff_corr, cutoff_xcorr) + features = [ + feature for feature in feature_names if random.randint(0, 300) == 0 + ] + studies = [study for study in study_names if random.randint(0, 1) == 0] + record = (features, cutoff_corr, cutoff_xcorr) + DB.insert_xcorr_record(studies, features, cutoff_corr, cutoff_xcorr) diff --git a/workflows/xcorr/list-records.py b/workflows/xcorr/list-records.py index 58ba01f2..5df9c6e0 100644 --- a/workflows/xcorr/list-records.py +++ b/workflows/xcorr/list-records.py @@ -1,14 +1,13 @@ - # LIST RECORDS PY # List all the records in the DB and their metadata from record import Record -from xcorr_db import xcorr_db, q +from xcorr_db import q, xcorr_db DB = xcorr_db("xcorr.db") feature_id2name, feature_name2id = DB.read_feature_names() -study_id2name, study_name2id = DB.read_study_names() +study_id2name, study_name2id = DB.read_study_names() # Main list of records records = [] @@ -18,7 +17,8 @@ DB.execute("select rowid from records;") while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break record_ids.append(row[0]) # Read the record data @@ -33,7 +33,8 @@ DB.execute("select * from features where record_id == %i;" % record_id) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break feature = feature_id2name[row[1]] record.features.append(feature) @@ -41,7 +42,8 @@ DB.execute("select * from studies where record_id == %i;" % record_id) while True: row = DB.cursor.fetchone() - if row == None: break + if row == None: + break study = study_id2name[row[1]] record.studies.append(study) diff --git a/workflows/xcorr/make-fake-data.py b/workflows/xcorr/make-fake-data.py index 3ee940e4..47f00c7c 100644 --- a/workflows/xcorr/make-fake-data.py +++ b/workflows/xcorr/make-fake-data.py @@ -1,27 +1,34 @@ -import numpy as np import os +import numpy as np + + def make_fake_data(out_dir): - numF = 10 # Number of features - numS = 50 # Number of samples to be multiplied by 2 + numF = 10 # Number of 
features + numS = 50 # Number of samples to be multiplied by 2 if not os.path.exists(out_dir): - os.makedirs(out_dir) + os.makedirs(out_dir) for j in range(6): data1 = np.random.randn(numF, numS) for i in range(numF): - data1[i, :] = data1[i, :] + i/5 + data1[i, :] = data1[i, :] + i / 5 data2 = np.random.randn(numF, numS) data1 = np.hstack((data1, data2)) data1 = np.transpose(data1) - data3 = np.random.randn(numF, int(numS/2)) + data3 = np.random.randn(numF, int(numS / 2)) for i in range(numF): - data3[i, :] = data3[i, :] + i/5 - data4 = np.random.randn(numF, int(numS/2)) + data3[i, :] = data3[i, :] + i / 5 + data4 = np.random.randn(numF, int(numS / 2)) data3 = np.hstack((data3, data4)) data3 = np.transpose(data3) - np.savetxt("{}/data{}.tsv".format(out_dir, j * 2), data1, delimiter='\t') - np.savetxt("{}/data{}.tsv".format(out_dir, j * 2 + 1), data3, delimiter='\t') + np.savetxt("{}/data{}.tsv".format(out_dir, j * 2), + data1, + delimiter="\t") + np.savetxt("{}/data{}.tsv".format(out_dir, j * 2 + 1), + data3, + delimiter="\t") + if __name__ == "__main__": - make_fake_data("./test_data") \ No newline at end of file + make_fake_data("./test_data") diff --git a/workflows/xcorr/record.py b/workflows/xcorr/record.py index 96bd92cf..9e16bcbf 100644 --- a/workflows/xcorr/record.py +++ b/workflows/xcorr/record.py @@ -1,17 +1,15 @@ - # RECORD PY # Represent a record in the DB + class Record: def __init__(self): self.features = [] - self.studies = [] + self.studies = [] def scan(self, row): - self.rowid, self.ts, self.cutoff_corr, self.cutoff_xcorr = \ - row[0:4] - + self.rowid, self.ts, self.cutoff_corr, self.cutoff_xcorr = row[0:4] def print(self): print("record: " + str(self.rowid)) diff --git a/workflows/xcorr/tests/uno_xcorr_tests.py b/workflows/xcorr/tests/uno_xcorr_tests.py index c0e3a51b..b537b887 100644 --- a/workflows/xcorr/tests/uno_xcorr_tests.py +++ b/workflows/xcorr/tests/uno_xcorr_tests.py @@ -1,42 +1,46 @@ import unittest -import uno_xcorr + import numpy as 
np +import uno_xcorr + # Run with: PYTHONPATH=UNO_BENCHMARK_PATH:BENCHMARK_COMMON_PATH python -m unittest tests.uno_sc -# E.g. PYTHONPATH=$HOME/Documents/repos/Benchmarks/Pilot1/Uno:$HOME/Documents/repos/Benchmarks/common +# E.g. PYTHONPATH=$HOME/Documents/repos/Benchmarks/Pilot1/Uno:$HOME/Documents/repos/Benchmarks/common # python -m unittest tests.uno_xcorr_tests class TestUnoXcorr(unittest.TestCase): def setUp(self): if uno_xcorr.gene_df is None: - dp = './test_data/rescaled_combined_single_drug_growth.bz2' - rp = './test_data/combined_rnaseq_data_lincs1000_combat.bz2' + dp = "./test_data/rescaled_combined_single_drug_growth.bz2" + rp = "./test_data/combined_rnaseq_data_lincs1000_combat.bz2" uno_xcorr.init_uno_xcorr(rp, dp) def test_init(self): shape = (15198, 943) self.assertEqual(shape[0], uno_xcorr.gene_df.shape[0]) self.assertEqual(shape[1], uno_xcorr.gene_df.shape[1]) - + shape = (27769716, 7) self.assertEqual(shape[0], uno_xcorr.drug_df.shape[0]) self.assertEqual(shape[1], uno_xcorr.drug_df.shape[1]) def test_source(self): - sources = ['CCLE', 'CTRP', 'GDC', 'GDSC', 'NCI60', 'NCIPDM', 'gCSI'] - df_sources = uno_xcorr.gene_df['source'].unique() + sources = ["CCLE", "CTRP", "GDC", "GDSC", "NCI60", "NCIPDM", "gCSI"] + df_sources = uno_xcorr.gene_df["source"].unique() self.assertEqual(sources, list(df_sources)) def test_xcorr(self): np.random.seed(42) - drug_ids = uno_xcorr.drug_df.iloc[np.random.permutation(uno_xcorr.drug_df.shape[0])[:10000], : ].DRUG_ID - f = './test_data/gene_out.txt' - uno_xcorr.coxen_feature_selection('CCLE', 'NCI60', 200, 200, drug_ids, f) + drug_ids = uno_xcorr.drug_df.iloc[np.random.permutation( + uno_xcorr.drug_df.shape[0])[:10000], :].DRUG_ID + f = "./test_data/gene_out.txt" + uno_xcorr.coxen_feature_selection("CCLE", "NCI60", 200, 200, drug_ids, + f) with open(f) as f_in: lines = f_in.readlines() self.assertEquals(200, len(lines)) - - -if __name__ == '__main__': + + +if __name__ == "__main__": unittest.main() diff --git 
a/workflows/xcorr/uno_xcorr.py b/workflows/xcorr/uno_xcorr.py index 8ab5a88e..2e530a3a 100644 --- a/workflows/xcorr/uno_xcorr.py +++ b/workflows/xcorr/uno_xcorr.py @@ -1,14 +1,15 @@ import os -import pandas as pd -import numpy as np +import numpy as np +import pandas as pd import xcorr gene_df = None drug_df = None + def init_uno_xcorr(rna_seq_path, drug_response_path, drug_ids=None): - """Initialize this package for xcorr and the Uno benchmark + """Initialize this package for xcorr and the Uno benchmark. :param rna_seq_path: path to gene expression data following the format of combined_rnaseq_data_combat @@ -16,51 +17,70 @@ def init_uno_xcorr(rna_seq_path, drug_response_path, drug_ids=None): rescaled_combined_single_drug_growth """ - rank = os.getenv('PMIX_RANK') - print('rank %s Setting up uno_xcorr...' % rank) + rank = os.getenv("PMIX_RANK") + print("rank %s Setting up uno_xcorr..." % rank) global gene_df - gene_df = pd.read_csv(rna_seq_path, compression='infer', sep='\t', engine='c', na_values=['na', '-', ''], - header=0, index_col=0) - gene_df['study'] = gene_df.index.str.extract('^([^.]*)', expand=False) + gene_df = pd.read_csv( + rna_seq_path, + compression="infer", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=0, + ) + gene_df["study"] = gene_df.index.str.extract("^([^.]*)", expand=False) global drug_df - drug_df = pd.read_csv(drug_response_path, compression='infer', sep='\t', engine='c', - na_values=['na', '-', ''], header=0, index_col=None) + drug_df = pd.read_csv( + drug_response_path, + compression="infer", + sep="\t", + engine="c", + na_values=["na", "-", ""], + header=0, + index_col=None, + ) if drug_ids is not None: - drug_df = drug_df[drug_df['DRUG_ID'].isin(drug_ids)] + drug_df = drug_df[drug_df["DRUG_ID"].isin(drug_ids)] -def select_features(df, study_col, study='all'): - """ Selects and returns a data frame from features whose - study is equal to the specified study. 
If study is 'all' then - all features are returned. +def select_features(df, study_col, study="all"): + """Selects and returns a data frame from features whose study is equal to + the specified study. If study is 'all' then all features are returned. :param study: a string specifing the study -- one of 'CCLE', 'CTRP', 'gCSI', 'GDSC', 'NCI60' or 'all'. """ df1 = df - if study != 'all': + if study != "all": df1 = df1[df1[study_col] == study] return df1 ## TODO: add additional args / functions for additional sample selection -def coxen_feature_selection(study_1, study_2, correlation_cutoff, - cross_correlation_cutoff, drug_ids=None, output_file=None): - - df1 = select_features(gene_df, 'study', study_1) +def coxen_feature_selection( + study_1, + study_2, + correlation_cutoff, + cross_correlation_cutoff, + drug_ids=None, + output_file=None, +): + + df1 = select_features(gene_df, "study", study_1) # add namespace prefix as required by Uno - df1 = df1.drop(['study'], axis=1).add_prefix("rnaseq.") + df1 = df1.drop(["study"], axis=1).add_prefix("rnaseq.") - df2 = select_features(gene_df, 'study', study_2) + df2 = select_features(gene_df, "study", study_2) # add namespace prefix as required by Uno - df2 = df2.drop(['study'], axis=1).add_prefix("rnaseq.") + df2 = df2.drop(["study"], axis=1).add_prefix("rnaseq.") - dr_df = select_features(drug_df, 'SOURCE', study_1) + dr_df = select_features(drug_df, "SOURCE", study_1) if drug_ids is not None: - dr_df = dr_df[dr_df['DRUG_ID'].isin(drug_ids)] + dr_df = dr_df[dr_df["DRUG_ID"].isin(drug_ids)] # keep only drug response data of cell lines in data1 dr_df = dr_df.iloc[np.where(np.isin(dr_df.CELLNAME, df1.index))[0], :] @@ -68,19 +88,20 @@ def coxen_feature_selection(study_1, study_2, correlation_cutoff, # perform the first step of COXEN approach to select predictive genes. To avoid exceeding the memory limit, # the prediction power of genes (i.e. absolute correlation coefficient with drug response) is calculated in batches. 
gid1 = xcorr.correlation_feature_selection(df1, dr_df.GROWTH.values, - dr_df.CELLNAME, correlation_cutoff) + dr_df.CELLNAME, + correlation_cutoff) # keep only predictive genes for data1 and data2 df1 = df1.iloc[:, gid1] df2 = df2.iloc[:, gid1] gid2 = xcorr.cross_correlation_feature_selection(df1.values, df2.values, - cross_correlation_cutoff) + cross_correlation_cutoff) genes = df1.columns[gid2] if output_file is not None: - with open(output_file, 'w') as f_out: + with open(output_file, "w") as f_out: for g in genes: - f_out.write('{}\n'.format(g)) + f_out.write("{}\n".format(g)) return genes diff --git a/workflows/xcorr/xcorr.py b/workflows/xcorr/xcorr.py index 34d603fa..c489d04a 100644 --- a/workflows/xcorr/xcorr.py +++ b/workflows/xcorr/xcorr.py @@ -1,9 +1,12 @@ -import numpy as np import os +import numpy as np + + def correlation_feature_selection(data, targets, labels, cutoff): - """ - Use Pearson correlation coefficient to select predictive features for regression. + """Use Pearson correlation coefficient to select predictive features for + regression. + :param data: an data table, where rows are samples and columns are features :param label: sample labels of data, which match with targets :param targets: a vector of real numbers indicating the regression targets, with a length the same as labels. 
@@ -14,13 +17,17 @@ def correlation_feature_selection(data, targets, labels, cutoff): """ batchSize = 100 - numBatch = int(np.ceil(data.shape[1]/batchSize)) + numBatch = int(np.ceil(data.shape[1] / batchSize)) cor = np.empty((data.shape[1], 1)) for i in range(numBatch): - startIndex = i*batchSize - endIndex = min((i+1)*batchSize, data.shape[1]) - cor_i = np.corrcoef(np.vstack((np.transpose(data.iloc[:, startIndex:endIndex].loc[labels, - :].values), np.reshape(targets, (1, len(targets)))))) + startIndex = i * batchSize + endIndex = min((i + 1) * batchSize, data.shape[1]) + cor_i = np.corrcoef( + np.vstack(( + np.transpose( + data.iloc[:, startIndex:endIndex].loc[labels, :].values), + np.reshape(targets, (1, len(targets))), + ))) cor[startIndex:endIndex, 0] = abs(cor_i[:-1, -1]) if cutoff < 1: gid = np.where(cor >= cutoff)[0] @@ -31,13 +38,12 @@ def correlation_feature_selection(data, targets, labels, cutoff): def cross_correlation_feature_selection(data1, data2, cutoff): - """ - Use the COXEN approach to select the features that are generalizable between data1 and data2. - data1 and data2 should have an equal number of features. The features in data1 and data2 should - match. - + """Use the COXEN approach to select the features that are generalizable + between data1 and data2. data1 and data2 should have an equal number of + features. The features in data1 and data2 should match. + :param data1: an array, where rows are samples and columns are features - :param data2: an array, where rows are samples and columns are features. + :param data2: an array, where rows are samples and columns are features. :param cutoff: a positive number for selecting generalizable features. If cutoff < 1, this function selects the features with a correlation coefficient >= cutoff. If cutoff >= 1, it must be an integer indicating the number of features to be selected based on correlation coefficient. 
@@ -48,12 +54,15 @@ def cross_correlation_feature_selection(data1, data2, cutoff): num = data1.shape[1] cor = [] for i in range(num): - cor.append(np.corrcoef(np.vstack((list(cor1[:i, i]) + list(cor1[(i + 1):, i]), - list(cor2[:i, i]) + list(cor2[(i + 1):, i]))))[0, 1]) + cor.append( + np.corrcoef( + np.vstack(( + list(cor1[:i, i]) + list(cor1[(i + 1):, i]), + list(cor2[:i, i]) + list(cor2[(i + 1):, i]), + )))[0, 1]) cor = np.array(cor) if cutoff < 1: fid = np.where(cor >= cutoff)[0] else: fid = np.argsort(-cor)[:int(cutoff)] return sorted(fid) - diff --git a/workflows/xcorr/xcorr_db.py b/workflows/xcorr/xcorr_db.py index 10b123fa..6c08ab7a 100644 --- a/workflows/xcorr/xcorr_db.py +++ b/workflows/xcorr/xcorr_db.py @@ -1,4 +1,3 @@ - # XCORR DB PY # DB helper functions @@ -8,33 +7,33 @@ import sqlite3 import sys + def setup_db(db_file): - if 'DB' not in globals(): - rank = os.getenv('PMIX_RANK') - print('rank %s Connecting to DB...' % rank) + if "DB" not in globals(): + rank = os.getenv("PMIX_RANK") + print("rank %s Connecting to DB..." 
% rank) global DB DB = xcorr_db(db_file) DB.read_feature_names() DB.read_study_names() return DB + class xcorr_db: def __init__(self, db_file, log=False): - """ - Sets up a wrapper around the SQL connection and cursor objects - Also caches dicts that convert between names and ids for the - features and studies tables - """ - #self.conn = sqlite3.connect(db_file) - #self.cursor = self.conn.cursor() + """Sets up a wrapper around the SQL connection and cursor objects Also + caches dicts that convert between names and ids for the features and + studies tables.""" + # self.conn = sqlite3.connect(db_file) + # self.cursor = self.conn.cursor() self.db_file = db_file self.feature_id2name = None self.feature_name2id = None - self.study_id2name = None - self.study_name2id = None - self.autoclose = True - self.logger = None # Default + self.study_id2name = None + self.study_name2id = None + self.autoclose = True + self.logger = None # Default if log: logging.basicConfig(format="SQL: %(message)s") self.logger = logging.getLogger("xcorr_db") @@ -47,54 +46,66 @@ def connect(self): # provisional for cp1 runs def insert_hpo_record(self, record_id): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.connect() with self.conn: - hpo_id = self.insert(table='hpos', names=['xcorr_record_id', 'time'], - values = [str(record_id), q(ts)]) + hpo_id = self.insert( + table="hpos", + names=["xcorr_record_id", "time"], + values=[str(record_id), q(ts)], + ) self.commit() return hpo_id def insert_hpo_run(self, hpo_id, param_string, run_directory): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.connect() with self.conn: - run_id = self.insert(table='hpo_runs', names=['hpoid', 'params', 'run_directory', 'start'], - values = [str(hpo_id), q(param_string), q(run_directory), q(ts)]) + run_id = self.insert( + table="hpo_runs", + names=["hpoid", 
"params", "run_directory", "start"], + values=[str(hpo_id), + q(param_string), + q(run_directory), + q(ts)], + ) self.commit() return run_id def update_hpo_run(self, run_id, result): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") sql = "update hpo_runs set obj_result = ?, end = ? where runid = ?" self.connect() with self.conn: self.cursor.execute(sql, (result, ts, run_id)) self.commit() - def insert_xcorr_record(self, studies, features, - cutoff_corr, cutoff_xcorr): - """ - Insert a new XCORR record. + def insert_xcorr_record(self, studies, features, cutoff_corr, cutoff_xcorr): + """Insert a new XCORR record. + :return: The ID of the new record """ - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - names = [ "time", "cutoff_corr", "cutoff_xcorr" ] - values = [ q(ts), str(cutoff_corr), str(cutoff_xcorr) ] + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + names = ["time", "cutoff_corr", "cutoff_xcorr"] + values = [q(ts), str(cutoff_corr), str(cutoff_xcorr)] self.connect() with self.conn: record_id = self.insert("records", names, values) for feature in features: feature_id = str(self.feature_name2id[feature]) - self.insert(table="features", - names=[ "record_id", "feature_id"], - values=[ record_id , feature_id ]) + self.insert( + table="features", + names=["record_id", "feature_id"], + values=[record_id, feature_id], + ) for study in studies: study_id = str(self.study_name2id[study]) - self.insert(table="studies", - names=[ "record_id", "study_id"], - values=[ record_id , study_id ]) + self.insert( + table="studies", + names=["record_id", "study_id"], + values=[record_id, study_id], + ) self.commit() self.log("inserted record: " + record_id) return record_id @@ -104,11 +115,13 @@ def scan_features_file(self, filename): with open(filename) as fp: while True: line = fp.readline() - if line == "": break + if line == "": + break tokens = line.split("#") line = tokens[0] 
line = line.strip() - if line == "": continue + if line == "": + continue line = line.replace("rnaseq.", "") results.append(line) return results @@ -122,10 +135,11 @@ def read_feature_names(self): self.feature_name2id = {} while True: row = self.cursor.fetchone() - if row == None: break + if row == None: + break rowid, name = row[0:2] self.feature_id2name[rowid] = name - self.feature_name2id[name] = rowid + self.feature_name2id[name] = rowid return self.feature_id2name, self.feature_name2id @@ -138,17 +152,19 @@ def read_study_names(self): self.study_name2id = {} while True: row = self.cursor.fetchone() - if row == None: break + if row == None: + break rowid, name = row[0:2] self.study_id2name[rowid] = name - self.study_name2id[name] = rowid + self.study_name2id[name] = rowid return self.study_id2name, self.study_name2id def insert(self, table, names, values): - """ Do a SQL insert """ - names_tpl = sql_tuple(names) + """Do a SQL insert.""" + names_tpl = sql_tuple(names) values_tpl = sql_tuple(values) - cmd = "insert into {} {} values {};".format(table, names_tpl, values_tpl) + cmd = "insert into {} {} values {};".format(table, names_tpl, + values_tpl) self.execute(cmd) rowid = str(self.cursor.lastrowid) return rowid @@ -180,19 +196,22 @@ def __del__(self): def q(s): - """ Quote the given string """ + """Quote the given string.""" return "'" + str(s) + "'" + def qL(L): - """ Quote each list entry as a string """ + """Quote each list entry as a string.""" return map(q, L) + def qA(*args): - """ Quote each argument as a string """ + """Quote each argument as a string.""" return map(q, args) + def sql_tuple(L): - """ Make the given list into a SQL-formatted tuple """ + """Make the given list into a SQL-formatted tuple.""" result = "" result += "(" result += ",".join(L)