From 0b59a7a8bf57b768ac8cc7a1359025e8a242341c Mon Sep 17 00:00:00 2001
From: Techercise
Date: Wed, 23 Jun 2021 06:43:56 -0700
Subject: [PATCH 1/5] Adding CoriGPU run files and modifying conf.yaml

---
 examples/conf.yaml              |  8 ++++----
 examples/corigpu_1GPU_slurm.cmd | 15 +++++++++++++++
 examples/corigpu_4GPU_slurm.cmd | 19 +++++++++++++++++++
 3 files changed, 38 insertions(+), 4 deletions(-)
 create mode 100644 examples/corigpu_1GPU_slurm.cmd
 create mode 100644 examples/corigpu_4GPU_slurm.cmd

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 0b996b7..3e19118 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -4,7 +4,7 @@
 # will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
-fs_path: '/tigress'
+fs_path: '/global/cscratch1/sd/'
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 4 # per node
 paths:
@@ -127,7 +127,7 @@ training:
   num_shots_at_once: 200 # large number = maximum number of epochs.
   # Early stopping will occur if loss does not decrease, after some patience
   # of epochs
-  num_epochs: 1000
+  num_epochs: 50
   use_mock_data: False
   data_parallel: False
   hyperparam_tuning: False
@@ -136,8 +136,8 @@ training:
   num_batches_minimum: 20 # minimum number of batches per epoch
   ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
   timeline_prof: False
-  step_limit: 50
-  no_validation: True
+  step_limit: 0
+  no_validation: False
 callbacks:
   list: ['earlystop']
   metrics: ['val_loss','val_roc','train_loss']
diff --git a/examples/corigpu_1GPU_slurm.cmd b/examples/corigpu_1GPU_slurm.cmd
new file mode 100644
index 0000000..86dd160
--- /dev/null
+++ b/examples/corigpu_1GPU_slurm.cmd
@@ -0,0 +1,15 @@
+#!/bin/bash
+#SBATCH -C gpu
+#SBATCH -t 01:30:00
+#SBATCH -G 1
+#SBATCH -c 4
+#SBATCH --exclusive
+
+# rm /global/cscratch1/sd/$USER/model_checkpoints/*
+# rm /global/cscratch1/sd/$USER/results/*
+# rm /global/cscratch1/sd/$USER/csv_logs/*
+# rm /global/cscratch1/sd/$USER/Graph/*
+# rm /global/cscratch1/sd/$USER/normalization/*
+
+export OMPI_MCA_btl="tcp,self,vader"
+srun python mpi_learn.py
diff --git a/examples/corigpu_4GPU_slurm.cmd b/examples/corigpu_4GPU_slurm.cmd
new file mode 100644
index 0000000..6ec0ed8
--- /dev/null
+++ b/examples/corigpu_4GPU_slurm.cmd
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -C gpu
+#SBATCH -t 02:00:00
+#SBATCH -N 4
+#SBATCH -G 4
+#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-socket=2
+#SBATCH -c 4
+#SBATCH --mem-per-cpu=0
+#SBATCH --exclusive
+
+# rm /global/cscratch1/sd/$USER/model_checkpoints/*
+# rm /global/cscratch1/sd/$USER/results/*
+# rm /global/cscratch1/sd/$USER/csv_logs/*
+# rm /global/cscratch1/sd/$USER/Graph/*
+# rm /global/cscratch1/sd/$USER/normalization/*
+
+export OMPI_MCA_btl="tcp,self,vader"
+srun python mpi_learn.py

From a0f05167d450b852674562b41e7f1848c4f3c957 Mon Sep 17 00:00:00 2001
From: Techercise
Date: Fri, 20 Aug 2021 13:44:37 -0400
Subject: [PATCH 2/5] Manually add OLCF-AMD documentation to master branch

---
 docs/OLCF-AMD.md | 296 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 docs/OLCF-AMD.md

diff --git a/docs/OLCF-AMD.md b/docs/OLCF-AMD.md
new file mode 100644
index 0000000..c8c2632
--- /dev/null
+++ b/docs/OLCF-AMD.md
@@ -0,0 +1,296 @@
# OLCF Spock Tutorial
*Last updated 2021-8-19*

*This document is built off of the excellent how-to guide created for [Princeton's TigerGPU](https://github.com/Techercise/plasma-python/blob/master/docs/PrincetonUTutorial.md)*

## Building the package
### Login to Spock

First, log in to the Spock head node via ssh:
```
ssh -X <username>@spock.olcf.ornl.gov
```
Note that `-X` is optional; it is only necessary if you plan on performing remote visualization, e.g. viewing the output `.png` files from the [section below](#learning-curves-example-shots-and-roc-per-epoch). Trusted X11 forwarding can be used with `-Y` instead of `-X` and may prevent timeouts, but it disables X11 SECURITY extension controls.

### Sample installation on Spock

#### Check out the Code Repository
Next, check out the source code from GitHub:
```
git clone https://github.com/PPPLDeepLearning/plasma-python
cd plasma-python
```

#### Install Miniconda
At the time of writing, Anaconda and Miniconda are not installed on Spock, so one of them must be installed manually. In their system documentation, AMD recommends downloading Miniconda.

To install Miniconda, download the Linux installer [here](https://docs.conda.io/en/latest/miniconda.html#linux-installers) and follow the installation instructions for Miniconda on [this page](https://conda.io/projects/conda/en/latest/user-guide/install/linux.html).

Once Miniconda is installed, create a conda environment:
```
conda create -n your_env_name python=3.8 -y
```

Then, activate the environment:
```
conda activate your_env_name
```

Ensure the following packages are installed in your conda environment:
```
pyyaml          # pip install pyyaml
pathos          # pip install pathos
hyperopt        # pip install hyperopt
matplotlib      # pip install matplotlib
keras           # pip install keras
tensorflow-rocm # pip install tensorflow-rocm
```

#### Modules
In order to load the correct modules with ease, creating a profile is recommended
```
vim frnn_spock.profile
```

Write the following to the profile:
```
module load rocm
module load cray-python
module load gcc
module load craype-accel-amd-gfx908
module load cray-mpich/8.1.7
module use /sw/aaims/spock/modulefiles
module load tensorflow

# These must be set before running if you want to use the Cray GPU-aware MPI.
# If running on only 1 GPU, there is no need to uncomment these lines.

# export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
# export MPICH_GPU_SUPPORT_ENABLED=1
# export HIPCC_COMPILE_FLAGS_APPEND="$HIPCC_COMPILE_FLAGS_APPEND -I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.7/gtl/lib -lmpi_gtl_hsa"

export MPICC="$(which mpicc)"
```

As of the latest update of this document (Summer 2021), the above modules correspond to the following versions on the Spock system, given by `module list` (note that this list also includes the default system modules):
```
Currently Loaded Modules:
  1) craype/2.7.8     3) libfabric/1.11.0.4.75  5) cray-dsmml/0.1.5        7) xpmem/2.2.40-2.1_2.28__g3cf3325.shasta   9) cray-pmi/6.0.12      11) DefApps/default    13) cray-python/3.8.5.1  15) craype-accel-amd-gfx908  17) rocm/4.1.0
  2) craype-x86-rome  4) craype-network-ofi     6) perftools-base/21.05.0  8) cray-libsci/21.06.1.1                   10) cray-pmi-lib/6.0.12  12) PrgEnv-cray/8.1.0  14) gcc/10.3.0           16) cray-mpich/8.1.7         18) tensorflow/2.3.6
```
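Before building anything against this stack, it is worth confirming that the profile loads cleanly. The following is a minimal sanity-check sketch; it assumes the profile was saved as `~/frnn_spock.profile` and that the conda environment created earlier exists. Note that `rocm-smi` only reports devices on nodes that actually have GPUs, so run it from a compute allocation rather than the head node:
```bash
# Load the modules from the profile and re-activate the conda environment.
source ~/frnn_spock.profile
conda activate your_env_name

# Confirm the expected module versions (rocm, cray-mpich, tensorflow, ...) are loaded.
module list

# Confirm the ROCm runtime can see the node's AMD GPUs.
rocm-smi

# Confirm TensorFlow imports cleanly and print its version.
python -c "import tensorflow as tf; print(tf.__version__)"
```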
#### Build mpi4py
If you want to run on multiple GPUs, mpi4py is required. At the time of writing, it must be installed manually on the Spock system. To install mpi4py, do the following:
```
# Ensure your conda environment is activated:
conda activate your_env_name

# Download mpi4py to your home directory
cd ~
curl -O -L https://bitbucket.org/mpi4py/mpi4py/downloads/mpi4py-3.0.3.tar.gz

# Untar the file
tar -xzvf mpi4py-3.0.3.tar.gz

cd mpi4py-3.0.3

# Edit the mpi.cfg file
vim mpi.cfg
```

Include the following segment in the mpi.cfg file:
```
[craympi]
mpi_dir = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1
mpicc = cc
mpicxx = CC
include_dirs = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1/include
libraries = mpi
library_dirs = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1/
```

Build and install mpi4py:
```
python setup.py build --mpi=craympi
python setup.py install
```

Next, install the `plasma-python` package:

```bash
# conda activate your_env_name
# cd ~/plasma-python
python setup.py install
```

## Understanding and preparing the input data
### Location of the data on Spock

**Currently, no public data exists on Spock, but we leave this section in here so the user can understand the input data.**

The JET and D3D datasets contain multi-modal time series of sensory measurements leading up to deleterious events called plasma disruptions. The datasets are located in the `/tigress/FRNN` project directory of the [GPFS](https://www.ibm.com/support/knowledgecenter/en/SSPT3X_3.0.0/com.ibm.swg.im.infosphere.biginsights.product.doc/doc/bi_gpfs_overview.html) filesystem on Princeton University clusters.

For convenience, create the following symbolic links:
```bash
cd /tigress/<netid>
ln -s /tigress/FRNN/shot_lists shot_lists
ln -s /tigress/FRNN/signal_data signal_data
```

### Configuring the dataset
All the configuration parameters are summarised in `examples/conf.yaml`. In this section, we highlight the important ones used to control the input data.

Currently, FRNN is capable of working with JET and D3D data as well as the cross-machine regime. The switch is done in the configuration file:
```yaml
paths:
    ...
    data: 'jet_0D'
```

Older yaml files kept for archival purposes will denote this dataset as follows:
```yaml
paths:
    ...
    data: 'jet_data_0D'
```
Use `d3d_data` for D3D signals, and use `jet_to_d3d_data` or `d3d_to_jet_data` for the cross-machine regime.

By default, FRNN will select, preprocess, and normalize all valid signals available in the above dataset. To choose only specific signals, use:
```yaml
paths:
    ...
    specific_signals: [q95,ip]
```
If left empty (`[]`), all valid signals defined on a machine will be used. Only set this variable if you need a custom set of signals.

Other parameters configured in `conf.yaml` include batch size, learning rate, neural network topology, and special conditions for hyperparameter sweeps.

### Preprocessing the input data
***Preprocessing the input data is currently not required on Spock, as the available data is already preprocessed.***

```bash
cd examples/
python guarantee_preprocessed.py
```
This will preprocess the data and save rescaled copies of the signals in `/tigress/<netid>/processed_shots`, `/tigress/<netid>/processed_shotlists`, and `/tigress/<netid>/normalization`.

Preprocessing must be performed only once per dataset. For example, consider the following dataset specified in the config file `examples/conf.yaml`:
```yaml
paths:
    data: jet_0D
```
This dataset takes about 20 minutes to preprocess in parallel, which can normally be done on the cluster head node.
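When preprocessing does have to be run, a quick way to confirm that it completed is to check that the output directories were populated. This sketch assumes the `/tigress/<netid>` layout used above; substitute whatever `fs_path` your `conf.yaml` points to:
```bash
# Count the files written by guarantee_preprocessed.py in each output directory.
for d in processed_shots processed_shotlists normalization; do
    echo "$d: $(ls /tigress/$USER/$d | wc -l) files"
done
```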
### Current signals and notations

Signal name | Description
--- | ---
q95 | q95 safety factor
ip | plasma current
li | internal inductance
lm | Locked mode amplitude
dens | Plasma density
energy | stored energy
pin | Input Power (beam for d3d)
pradtot | Radiated Power
pradcore | Radiated Power Core
pradedge | Radiated Power Edge
pechin | ECH input power, not always on
betan | Normalized Beta
energydt | stored energy time derivative
torquein | Input Beam Torque
tmamp1 | Tearing Mode amplitude (rotating 2/1)
tmamp2 | Tearing Mode amplitude (rotating 3/2)
tmfreq1 | Tearing Mode frequency (rotating 2/1)
tmfreq2 | Tearing Mode frequency (rotating 3/2)
ipdirect | plasma current direction

## Training and inference

Use the Slurm job scheduler to perform batch or interactive analysis on the Spock system.

### Batch job

A sample batch job script for 1 GPU is provided in the examples directory and is called `spock_1GPU_slurm.cmd`. It can be run using `sbatch spock_1GPU_slurm.cmd`. Note that the project/account (`-A`) and partition (`-p`) arguments will need to reflect your project and assigned partition. A sketch of a comparable script is shown after the tips below.

Some batch job tips:
* For non-interactive batch analysis, make sure to allocate exactly 1 MPI process per GPU: if `X` is the number of nodes used for distributed training, the total number of GPUs (and MPI processes) is `X * 4`, since each node hosts 4 GPUs. This configuration guarantees 1 MPI process per GPU, regardless of the value of `X`.
* Update the `num_gpus` value in `conf.yaml` to correspond to the total number of GPUs specified for your Slurm allocation.
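The sketch below is an illustration rather than a copy of `spock_1GPU_slurm.cmd`: the `<account>` and `<partition>` values are placeholders, the profile path assumes the file from the Modules section was saved to your home directory, and the exact GPU-request flags should be checked against the current OLCF documentation:
```bash
#!/bin/bash
#SBATCH -A <account>          # your OLCF project ID
#SBATCH -p <partition>        # your assigned Spock partition
#SBATCH -t 01:30:00
#SBATCH -N 1
#SBATCH --ntasks-per-node=1   # 1 MPI rank per GPU
#SBATCH --gres=gpu:1
#SBATCH -c 4

# Recreate the build environment inside the batch shell.
source ~/frnn_spock.profile
conda activate your_env_name

# Launch FRNN; a single rank drives the single GPU.
srun -n 1 python mpi_learn.py
```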
Monitor the job's completion via:
```bash
squeue --me
```
Optionally, add email notification options to the Slurm configuration to be notified about job completion:
```
#SBATCH --mail-user=<username>@email.com
#SBATCH --mail-type=ALL
```

### Interactive job

The interactive option is preferred for **debugging** or running in a **notebook**; for all other cases, batch mode is preferred.
The workflow is to request an interactive session for a 1-GPU interactive job:

```bash
salloc -t 02:00:00 -A <account> -N 1 --gres=gpu:1 --exclusive -p <partition> --ntasks-per-socket=1 --ntasks-per-node=1
```

[//]: # (Note, the modules might not/are not inherited from the shell that spawns the interactive Slurm session. Need to reload anaconda module, activate environment, and reload other compiler/library modules)

Ensure the above modules are still loaded and reactivate your conda environment.
Then, launch the application from the command line:

```bash
python mpi_learn.py
```

## Visualizing learning

A regular FRNN run will produce several outputs and callbacks.

## Custom visualization
You can visualize the accuracy of the trained FRNN model using the custom Python scripts and notebooks included in the repository.

### Learning curves, example shots, and ROC per epoch

You can produce the ROC curves for validation and test data, as well as visualizations of shots, by using:
```
cd examples/
python performance_analysis.py
```
The `performance_analysis.py` script takes the file produced by training the neural network as input and produces several `.png` files with plots as output.

In addition, you can check the scalar variable summaries for training loss, validation loss, and validation ROC logged in `<output_dir>/csv_logs` (each run will produce a new log file with a timestamp in its name).

Sample notebooks for analyzing the files in this directory can be found in `examples/notebooks/`. For instance, the [LearningCurves.ipynb](https://github.com/PPPLDeepLearning/plasma-python/blob/master/examples/notebooks/LearningCurves.ipynb) notebook contains a variation on the following code snippet:
```python
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_file, save

data = pd.read_csv("<output_dir>/csv_logs/<log_name>.csv")

from bokeh.io import output_notebook
output_notebook()

from bokeh.models import Range1d
# optionally set the plotting range
# left, right, bottom, top = -0.1, 31, 0.005, 1.51

p = figure(title="Learning curve", y_axis_label="Training loss", x_axis_label='Epoch number') #,y_axis_type="log")
#p.set(x_range=Range1d(left, right), y_range=Range1d(bottom, top))

p.line(data['epoch'].values, data['train_loss'].values, legend="Test description",
       line_color="tomato", line_dash="dotdash", line_width=2)
p.legend.location = "top_right"
show(p, notebook_handle=True)
```
The resulting plot should match the `train_loss` plot in the Scalars tab of the TensorBoard summary.

#### Learning curve summaries per mini-batch

To extract per mini-batch summaries, we require a finer granularity of checkpoint data than what is logged to the per-epoch lines of the `csv_logs/` files. We must directly use the output produced by FRNN, logged to the standard output stream. In the case of non-interactive Slurm batch jobs, it will all be contained in the Slurm output file, e.g. `slurm-3842170.out`. Refer to the following notebook to perform the analysis of the learning curve at the mini-batch level: [FRNN_scaling.ipynb](https://github.com/PPPLDeepLearning/plasma-python/blob/master/examples/notebooks/FRNN_scaling.ipynb)
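As a quick first pass before opening that notebook, the per-step lines can be skimmed straight out of the job log. The job ID and the grep pattern below are examples only; the exact log format depends on the FRNN version, so adjust the pattern to match your output:
```bash
# Skim candidate per-step training lines from a Slurm log file.
grep -i "loss" slurm-3842170.out | head -n 20
```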
From ee4d8a81975c1ff942b604fa27fa7b21b7a8100e Mon Sep 17 00:00:00 2001
From: Techercise
Date: Fri, 20 Aug 2021 13:54:49 -0400
Subject: [PATCH 3/5] Reset conf.yaml to default and remove WIP Cori job scripts

---
 examples/conf.yaml              |  8 ++++----
 examples/corigpu_1GPU_slurm.cmd | 15 ---------------
 examples/corigpu_4GPU_slurm.cmd | 19 -------------------
 3 files changed, 4 insertions(+), 38 deletions(-)
 delete mode 100644 examples/corigpu_1GPU_slurm.cmd
 delete mode 100644 examples/corigpu_4GPU_slurm.cmd

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 3e19118..0b996b7 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -4,7 +4,7 @@
 # will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
-fs_path: '/global/cscratch1/sd/'
+fs_path: '/tigress'
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 4 # per node
 paths:
@@ -127,7 +127,7 @@ training:
   num_shots_at_once: 200 # large number = maximum number of epochs.
   # Early stopping will occur if loss does not decrease, after some patience
   # of epochs
-  num_epochs: 50
+  num_epochs: 1000
   use_mock_data: False
   data_parallel: False
   hyperparam_tuning: False
@@ -136,8 +136,8 @@ training:
   num_batches_minimum: 20 # minimum number of batches per epoch
   ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
   timeline_prof: False
-  step_limit: 0
-  no_validation: False
+  step_limit: 50
+  no_validation: True
 callbacks:
   list: ['earlystop']
   metrics: ['val_loss','val_roc','train_loss']
diff --git a/examples/corigpu_1GPU_slurm.cmd b/examples/corigpu_1GPU_slurm.cmd
deleted file mode 100644
index 86dd160..0000000
--- a/examples/corigpu_1GPU_slurm.cmd
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-#SBATCH -C gpu
-#SBATCH -t 01:30:00
-#SBATCH -G 1
-#SBATCH -c 4
-#SBATCH --exclusive
-
-# rm /global/cscratch1/sd/$USER/model_checkpoints/*
-# rm /global/cscratch1/sd/$USER/results/*
-# rm /global/cscratch1/sd/$USER/csv_logs/*
-# rm /global/cscratch1/sd/$USER/Graph/*
-# rm /global/cscratch1/sd/$USER/normalization/*
-
-export OMPI_MCA_btl="tcp,self,vader"
-srun python mpi_learn.py
diff --git a/examples/corigpu_4GPU_slurm.cmd b/examples/corigpu_4GPU_slurm.cmd
deleted file mode 100644
index 6ec0ed8..0000000
--- a/examples/corigpu_4GPU_slurm.cmd
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-#SBATCH -C gpu
-#SBATCH -t 02:00:00
-#SBATCH -N 4
-#SBATCH -G 4
-#SBATCH --ntasks-per-node=4
-#SBATCH --ntasks-per-socket=2
-#SBATCH -c 4
-#SBATCH --mem-per-cpu=0
-#SBATCH --exclusive
-
-# rm /global/cscratch1/sd/$USER/model_checkpoints/*
-# rm /global/cscratch1/sd/$USER/results/*
-# rm /global/cscratch1/sd/$USER/csv_logs/*
-# rm /global/cscratch1/sd/$USER/Graph/*
-# rm /global/cscratch1/sd/$USER/normalization/*
-
-export OMPI_MCA_btl="tcp,self,vader"
-srun python mpi_learn.py

From d8966f1209342b352de16cee1f071d62fe3717ea Mon Sep 17 00:00:00 2001
From: Techercise
Date: Fri, 1 Oct 2021 11:57:31 -0400
Subject: [PATCH 4/5] Update OLCF-AMD.md to reflect @felker's comments

---
 docs/OLCF-AMD.md | 176 ++---------------------------------------------
 1 file changed, 5 insertions(+), 171 deletions(-)

diff --git a/docs/OLCF-AMD.md b/docs/OLCF-AMD.md
index c8c2632..be9d3fc 100644
--- a/docs/OLCF-AMD.md
+++ b/docs/OLCF-AMD.md
@@ -1,7 +1,7 @@
 # OLCF Spock Tutorial
 *Last updated 2021-8-19*

-*This document is built off of the excellent how-to guide created for [Princeton's TigerGPU](https://github.com/Techercise/plasma-python/blob/master/docs/PrincetonUTutorial.md)*
+*This document is built off of the excellent how-to guide created for [Princeton's TigerGPU](./PrincetonUTutorial.md)*

 ## Building the package
 ### Login to Spock
@@ -47,7 +47,7 @@
 #### Modules
-In order to load the correct modules with ease, creating a profile is recommended
+In order to load the correct modules with ease, creating a profile is recommended. Create a profile named
 ```
-vim frnn_spock.profile
+frnn_spock.profile
 ```

 Write the following to the profile:
@@ -96,7 +96,7 @@
 cd mpi4py-3.0.3

 # Edit the mpi.cfg file
-vim mpi.cfg
+`mpi.cfg`
 ```

 Include the following segment in the mpi.cfg file:
@@ -125,172 +125,6 @@
 ## Understanding and preparing the input data
-### Location of the data on Spock
-
-**Currently, no public data exists on Spock, but we leave this section in here so the user can understand the input data.**
-
-The JET and D3D datasets contain multi-modal time series of sensory measurements leading up to deleterious events called plasma disruptions. The datasets are located in the `/tigress/FRNN` project directory of the [GPFS](https://www.ibm.com/support/knowledgecenter/en/SSPT3X_3.0.0/com.ibm.swg.im.infosphere.biginsights.product.doc/doc/bi_gpfs_overview.html) filesystem on Princeton University clusters.
-
-For convenience, create the following symbolic links:
-```bash
-cd /tigress/<netid>
-ln -s /tigress/FRNN/shot_lists shot_lists
-ln -s /tigress/FRNN/signal_data signal_data
-```
-
-### Configuring the dataset
-All the configuration parameters are summarised in `examples/conf.yaml`. In this section, we highlight the important ones used to control the input data.
-
-Currently, FRNN is capable of working with JET and D3D data as well as the cross-machine regime. The switch is done in the configuration file:
-```yaml
-paths:
-    ...
-    data: 'jet_0D'
-```
-
-Older yaml files kept for archival purposes will denote this dataset as follows:
-```yaml
-paths:
-    ...
-    data: 'jet_data_0D'
-```
-Use `d3d_data` for D3D signals, and use `jet_to_d3d_data` or `d3d_to_jet_data` for the cross-machine regime.
-
-By default, FRNN will select, preprocess, and normalize all valid signals available in the above dataset. To choose only specific signals, use:
-```yaml
-paths:
-    ...
-    specific_signals: [q95,ip]
-```
-If left empty (`[]`), all valid signals defined on a machine will be used. Only set this variable if you need a custom set of signals.
-
-Other parameters configured in `conf.yaml` include batch size, learning rate, neural network topology, and special conditions for hyperparameter sweeps.
-
-### Preprocessing the input data
-***Preprocessing the input data is currently not required on Spock, as the available data is already preprocessed.***
-
-```bash
-cd examples/
-python guarantee_preprocessed.py
-```
-This will preprocess the data and save rescaled copies of the signals in `/tigress/<netid>/processed_shots`, `/tigress/<netid>/processed_shotlists`, and `/tigress/<netid>/normalization`.
-
-Preprocessing must be performed only once per dataset. For example, consider the following dataset specified in the config file `examples/conf.yaml`:
-```yaml
-paths:
-    data: jet_0D
-```
-This dataset takes about 20 minutes to preprocess in parallel, which can normally be done on the cluster head node.
-
-### Current signals and notations
-
-Signal name | Description
---- | ---
-q95 | q95 safety factor
-ip | plasma current
-li | internal inductance
-lm | Locked mode amplitude
-dens | Plasma density
-energy | stored energy
-pin | Input Power (beam for d3d)
-pradtot | Radiated Power
-pradcore | Radiated Power Core
-pradedge | Radiated Power Edge
-pechin | ECH input power, not always on
-betan | Normalized Beta
-energydt | stored energy time derivative
-torquein | Input Beam Torque
-tmamp1 | Tearing Mode amplitude (rotating 2/1)
-tmamp2 | Tearing Mode amplitude (rotating 3/2)
-tmfreq1 | Tearing Mode frequency (rotating 2/1)
-tmfreq2 | Tearing Mode frequency (rotating 3/2)
-ipdirect | plasma current direction
-
-## Training and inference
-
-Use the Slurm job scheduler to perform batch or interactive analysis on the Spock system.
-
-### Batch job
-
-A sample batch job script for 1 GPU is provided in the examples directory and is called `spock_1GPU_slurm.cmd`. It can be run using `sbatch spock_1GPU_slurm.cmd`. Note that the project/account (`-A`) and partition (`-p`) arguments will need to reflect your project and assigned partition.
-
-Some batch job tips:
-* For non-interactive batch analysis, make sure to allocate exactly 1 MPI process per GPU: if `X` is the number of nodes used for distributed training, the total number of GPUs (and MPI processes) is `X * 4`, since each node hosts 4 GPUs. This configuration guarantees 1 MPI process per GPU, regardless of the value of `X`.
-* Update the `num_gpus` value in `conf.yaml` to correspond to the total number of GPUs specified for your Slurm allocation.
-
-Monitor the job's completion via:
-```bash
-squeue --me
-```
-Optionally, add email notification options to the Slurm configuration to be notified about job completion:
-```
-#SBATCH --mail-user=<username>@email.com
-#SBATCH --mail-type=ALL
-```
-
-### Interactive job
-
-The interactive option is preferred for **debugging** or running in a **notebook**; for all other cases, batch mode is preferred.
-The workflow is to request an interactive session for a 1-GPU interactive job:
-
-```bash
-salloc -t 02:00:00 -A <account> -N 1 --gres=gpu:1 --exclusive -p <partition> --ntasks-per-socket=1 --ntasks-per-node=1
-```
-
-[//]: # (Note, the modules might not/are not inherited from the shell that spawns the interactive Slurm session. Need to reload anaconda module, activate environment, and reload other compiler/library modules)
-
-Ensure the above modules are still loaded and reactivate your conda environment.
-Then, launch the application from the command line:
-
-```bash
-python mpi_learn.py
-```
-
-## Visualizing learning
-
-A regular FRNN run will produce several outputs and callbacks.
-
-## Custom visualization
-You can visualize the accuracy of the trained FRNN model using the custom Python scripts and notebooks included in the repository.
-
-### Learning curves, example shots, and ROC per epoch
-
-You can produce the ROC curves for validation and test data, as well as visualizations of shots, by using:
-```
-cd examples/
-python performance_analysis.py
-```
-The `performance_analysis.py` script takes the file produced by training the neural network as input and produces several `.png` files with plots as output.
-
-In addition, you can check the scalar variable summaries for training loss, validation loss, and validation ROC logged in `<output_dir>/csv_logs` (each run will produce a new log file with a timestamp in its name).
-
-Sample notebooks for analyzing the files in this directory can be found in `examples/notebooks/`. For instance, the [LearningCurves.ipynb](https://github.com/PPPLDeepLearning/plasma-python/blob/master/examples/notebooks/LearningCurves.ipynb) notebook contains a variation on the following code snippet:
-```python
-import pandas as pd
-import numpy as np
-from bokeh.plotting import figure, show, output_file, save
-
-data = pd.read_csv("<output_dir>/csv_logs/<log_name>.csv")
-
-from bokeh.io import output_notebook
-output_notebook()
-
-from bokeh.models import Range1d
-# optionally set the plotting range
-# left, right, bottom, top = -0.1, 31, 0.005, 1.51
-
-p = figure(title="Learning curve", y_axis_label="Training loss", x_axis_label='Epoch number') #,y_axis_type="log")
-#p.set(x_range=Range1d(left, right), y_range=Range1d(bottom, top))
-
-p.line(data['epoch'].values, data['train_loss'].values, legend="Test description",
-       line_color="tomato", line_dash="dotdash", line_width=2)
-p.legend.location = "top_right"
-show(p, notebook_handle=True)
-```
-The resulting plot should match the `train_loss` plot in the Scalars tab of the TensorBoard summary.
-
-#### Learning curve summaries per mini-batch
-
-To extract per mini-batch summaries, we require a finer granularity of checkpoint data than what is logged to the per-epoch lines of the `csv_logs/` files. We must directly use the output produced by FRNN, logged to the standard output stream. In the case of non-interactive Slurm batch jobs, it will all be contained in the Slurm output file, e.g. `slurm-3842170.out`. Refer to the following notebook to perform the analysis of the learning curve at the mini-batch level: [FRNN_scaling.ipynb](https://github.com/PPPLDeepLearning/plasma-python/blob/master/examples/notebooks/FRNN_scaling.ipynb)
+
+To learn how to understand and prepare the input data, please see the [corresponding section in the TigerGPU tutorial](./PrincetonUTutorial.md#understanding-and-preparing-the-input-data)

From 13181da50edea137ffa05b2382017c72a6de0176 Mon Sep 17 00:00:00 2001
From: Techercise
Date: Fri, 1 Oct 2021 12:01:25 -0400
Subject: [PATCH 5/5] Correct the location of backticks on mpi.cfg

---
 docs/OLCF-AMD.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/OLCF-AMD.md b/docs/OLCF-AMD.md
index be9d3fc..bf7666d 100644
--- a/docs/OLCF-AMD.md
+++ b/docs/OLCF-AMD.md
@@ -1,5 +1,5 @@
 # OLCF Spock Tutorial
-*Last updated 2021-8-19*
+*Last updated 2021-10-1*

 *This document is built off of the excellent how-to guide created for [Princeton's TigerGPU](./PrincetonUTutorial.md)*

@@ -96,16 +96,16 @@
 cd mpi4py-3.0.3

 # Edit the mpi.cfg file
-`mpi.cfg`
+mpi.cfg
 ```

-Include the following segment in the mpi.cfg file:
+Include the following segment in the `mpi.cfg` file:
 ```
 [craympi]
 mpi_dir = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1
 mpicc = cc
 mpicxx = CC
 include_dirs = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1/include
 libraries = mpi
 library_dirs = /opt/cray/pe/mpich/8.1.4/ofi/crayclang/9.1/
 ```
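Taken together, the five patches above form an mbox-style series. Assuming they are saved into a single file (the name `frnn-spock-series.mbox` here is hypothetical), they can be applied in order to a local checkout with `git am`:
```bash
cd plasma-python
# Apply PATCH 1/5 through PATCH 5/5 in order; git am preserves the
# original authorship, dates, and commit messages.
git am frnn-spock-series.mbox
```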