
Commit 2150150

Merge pull request #40 from PPPLDeepLearning/hotfix/warnings
Fix bugs from the jdev branch merge, suppress new TensorFlow warnings, reindex epochs, and improve diagnostics.
2 parents (d013980 + df88d1a); commit 2150150

24 files changed: +852 additions, -664 deletions

.gitignore

Lines changed: 22 additions & 0 deletions
@@ -7,6 +7,11 @@
 # Generated by test
 plot_*.html
 
+# Outputs from analysis scripts
+*.png
+out.txt
+*.npz
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -95,3 +100,20 @@ ENV/
 
 # Rope project settings
 .ropeproject
+
+# Job scheduler output
+################
+# Slurm
+*.out
+
+# Cobalt
+*.output
+*.error
+*.cobaltlog
+
+# PBS
+# *.o*
+# *.e*
+
+# Etc
+*.local

.travis.yml

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,7 @@
 language: python
-
+branches:
+  only:
+  - master
 os:
 - linux

data/signals.py

Lines changed: 9 additions & 7 deletions
@@ -1,4 +1,5 @@
 from __future__ import print_function
+import plasma.global_vars as g
 import numpy as np
 import sys
 
@@ -57,27 +58,27 @@ def get_units(str):
         found = True
 
     except Exception as e:
-        print(e)
+        g.print_unique(e)
         sys.stdout.flush()
         pass
 
     # Retrieve data from PTDATA if node not found
     if not found:
-        # print("not in full path {}".format(signal))
+        # g.print_unique("not in full path {}".format(signal))
         data = c.get('_s = ptdata2("'+signal+'",'+str(shot)+')').data()
         if len(data) != 1:
             rank = np.ndim(data)
             found = True
     # Retrieve data from Pseudo-pointname if not in ptdata
     if not found:
-        # print("not in PTDATA {}".format(signal))
+        # g.print_unique("not in PTDATA {}".format(signal))
         data = c.get('_s = pseudo("'+signal+'",'+str(shot)+')').data()
         if len(data) != 1:
             rank = np.ndim(data)
             found = True
     # this means the signal wasn't found
     if not found:
-        print("No such signal: {}".format(signal))
+        g.print_unique("No such signal: {}".format(signal))
         pass
 
     # get time base
@@ -125,7 +126,7 @@ def fetch_jet_data(signal_path, shot_num, c):
             signal_path, shot_num)).data()
         found = True
     except Exception as e:
-        print(e)
+        g.print_unique(e)
         sys.stdout.flush()
         # pass
     return time, data, ydata, found
@@ -361,8 +362,9 @@ def fetch_nstx_data(signal_path, shot_num, c):
 
 all_signals_restricted = all_signals
 
-print('all signals (determines which signals are downloaded & preprocessed):')
-print(all_signals.values())
+g.print_unique('All signals (determines which signals are downloaded'
+               ' & preprocessed):')
+g.print_unique(all_signals.values())
 
 fully_defined_signals = {
     sig_name: sig for (sig_name, sig) in all_signals_restricted.items() if (
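
The recurring change in this file replaces bare print calls with g.print_unique, so that in multi-process (MPI) runs a diagnostic message is emitted once rather than once per worker. A minimal sketch of such a helper, assuming an mpi4py-style communicator (the actual implementation lives in plasma.global_vars and may differ):

from __future__ import print_function
import sys

try:
    from mpi4py import MPI
    _comm = MPI.COMM_WORLD
except ImportError:
    _comm = None  # serial run: behave like plain print


def print_unique(*args, **kwargs):
    # Only rank 0 prints, so N workers produce one copy of a message
    # instead of N interleaved copies.
    if _comm is None or _comm.Get_rank() == 0:
        print(*args, **kwargs)
        sys.stdout.flush()

With a helper like this, g.print_unique(e) in the except blocks above keeps per-shot fetch errors readable during large distributed downloads.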

examples/conf.yaml

Lines changed: 55 additions & 52 deletions
@@ -1,129 +1,132 @@
-#conf.py will parse the yaml and extract parameters based on what is specified
+# conf.py will parse the yaml and extract parameters based on what is specified
 
-#will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
+# will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
 
 fs_path: '/tigress'
-target: 'hinge' #'maxhinge' #'maxhinge' #'binary' #'hinge'
-num_gpus: 4
+target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
+num_gpus: 4 # per node
 
 paths:
-    signal_prepath: '/signal_data/' #/signal_data/jet/
+    signal_prepath: '/signal_data/' # /signal_data/jet/
     shot_list_dir: '/shot_lists/'
     tensorboard_save_path: '/Graph/'
-    data: d3d_data_0D #'d3d_to_jet_data' #'d3d_to_jet_data' # 'jet_to_d3d_data' #jet_data
-    specific_signals: [] #['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile'] #if left empty will use all valid signals defined on a machine. Only use if need a custom set
+    data: d3d_data_0D # 'd3d_to_jet_data' # 'd3d_to_jet_data' # 'jet_to_d3d_data' # jet_data
+    # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
+    specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
     executable: "mpi_learn.py"
     shallow_executable: "learn.py"
 
 data:
-    bleed_in: 0 #how many shots from the test sit to use in training?
-    bleed_in_repeat_fac: 1 #how many times to repeat shots in training and validation?
+    bleed_in: 0 # how many shots from the test sit to use in training?
+    bleed_in_repeat_fac: 1 # how many times to repeat shots in training and validation?
     bleed_in_remove_from_test: True
     bleed_in_equalize_sets: False
-    signal_to_augment: None #'plasma current' #or None
+    # TODO(KGF): make next parameter use 'none' instead of None
+    signal_to_augment: None # 'plasma current' # or None
     augmentation_mode: 'none'
     augment_during_training: False
     cut_shot_ends: True
     T_min_warn: 30
     recompute: False
     recompute_normalization: False
-    #specifies which of the signals in the signals_dirs order contains the plasma current info
+    # specifies which of the signals in the signals_dirs order contains the plasma current info
     current_index: 0
     plotting: False
-    #train/validate split
-    #how many shots to use
-    use_shots: 200000 #1000 #200000
-    positive_example_penalty: 1.0 #by what factor to upweight positive examples?
-    #normalization timescale
+    # how many shots to use
+    use_shots: 200000 # 1000 # 200000
+    positive_example_penalty: 1.0 # by what factor to upweight positive examples?
+    # normalization timescale
     dt: 0.001
-    #maximum TTD considered
+    # maximum TTD considered
    T_max: 1000.0
-    #The shortest works best so far: less overfitting. log TTd prediction also works well. 0.5 better than 0.2
-    T_warning: 1.024 #1.024 #1.024 #0.512 #0.25 #1.0 #1.0 #warning time in seconds
+    # The shortest works best so far: less overfitting. log TTd prediction also works well. 0.5 better than 0.2
+    T_warning: 1.024 # 1.024 # 1.024 # 0.512 # 0.25 # 1.0 # 1.0 # warning time in seconds
     current_thresh: 750000
     current_end_thresh: 10000
-    #the characteristic decay length of the decaying moving average window
+    # the characteristic decay length of the decaying moving average window
     window_decay: 2
-    #the width of the actual window
+    # the width of the actual window
     window_size: 10
-    #TODO optimize
+    # TODO(KGF): optimize the normalizer parameters
     normalizer: 'var'
     norm_stat_range: 100.0
     equalize_classes: False
-    # shallow_sample_prob: 0.01 #the fraction of samples with which to train the shallow model
+    # shallow_sample_prob: 0.01 # the fraction of samples with which to train the shallow model
     floatx: 'float32'
 
 model:
     loss_scale_factor: 1.0
     use_batch_norm: false
     torch: False
-    shallow: True
+    shallow: False
     shallow_model:
-        num_samples: 1000000 #1000000 #the number of samples to use for training
-        type: "xgboost" #"xgboost" #"xgboost" #"random_forest" "xgboost"
-        n_estimators: 100 #for random forest
-        max_depth: 3 #for random forest and xgboost (def = 3)
-        C: 1.0 #for svm
-        kernel: "rbf" #rbf, sigmoid, linear, poly, for svm
-        learning_rate: 0.1 #xgboost
-        scale_pos_weight: 10.0 #xgboost
-        final_hidden_layer_size: 10 #final layers has this many neurons, every layer before twice as many
+        num_samples: 1000000 # 1000000 # the number of samples to use for training
+        type: "xgboost" # "xgboost" #"random_forest"
+        n_estimators: 100 # for random forest
+        max_depth: 3 # for random forest and xgboost (def = 3)
+        C: 1.0 # for svm
+        kernel: "rbf" # rbf, sigmoid, linear, poly, for svm
+        learning_rate: 0.1 # used in xgboost
+        scale_pos_weight: 10.0 # used in xgboost
+        final_hidden_layer_size: 10 # final layers has this many neurons, every layer before twice as many
         num_hidden_layers: 3
         learning_rate_mlp: 0.0001
         mlp_regularization: 0.0001
-        skip_train: False #should a finished model be loaded if available
-    #length of LSTM memory
+        skip_train: False # should a finished model be loaded if available
+    # length of LSTM memory
     pred_length: 200
     pred_batch_size: 128
-    #TODO optimize
+    # TODO(KGF): optimize length of LSTM memory
     length: 128
     skip: 1
-    #hidden layer size
-    #TODO optimize
+    # hidden layer size
+    # TODO(KGF): optimize size of RNN layers
     rnn_size: 200
-    #size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100. Prediction much better with size 100, size 20 cannot capture the data.
+    # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100. Prediction much better with size 100, size 20 cannot capture the data.
     rnn_type: 'LSTM'
-    #TODO optimize
+    # TODO(KGF): optimize number of RNN layers
     rnn_layers: 2
     num_conv_filters: 128
     size_conv_filters: 3
     num_conv_layers: 3
     pool_size: 2
     dense_size: 128
     extra_dense_input: False
-    #have not found a difference yet
+    # have not found a difference yet
     optimizer: 'adam'
     clipnorm: 10.0
     regularization: 0.001
     dense_regularization: 0.001
-    #1e-4 is too high, 5e-7 is too low. 5e-5 seems best at 256 batch size, full dataset and ~10 epochs, and lr decay of 0.90. 1e-4 also works well if we decay a lot (i.e ~0.7 or more)
-    lr: 0.00002 #0.00001 #0.0005 #for adam plots 0.0000001 #0.00005 #0.00005 #0.00005
-    lr_decay: 0.97 #0.98 #0.9
+    # lr=1e-4 is too high, 5e-7 is too low. 5e-5 seems best at 256 batch size, full dataset
+    # and ~10 epochs, and lr decay of 0.90
+    # lr=1e-4 also works well if we decay a lot (i.e ~0.7 or more)
+    lr: 0.00002 # 0.00001 # 0.0005 # for adam plots 0.0000001 # 0.00005 # 0.00005 # 0.00005
+    lr_decay: 0.97 # 0.98 # 0.9
     stateful: True
     return_sequences: True
     dropout_prob: 0.1
-    #only relevant if we want to do mpi training. The number of steps with a single replica
+    # only relevant if we want to do MPI training. The number of steps with a single replica
     warmup_steps: 0
-    ignore_timesteps: 100 #how many initial timesteps to ignore during evaluation (to let the internal state settle)
+    ignore_timesteps: 100 # how many initial timesteps to ignore during evaluation (to let the internal state settle)
     backend: 'tensorflow'
 training:
     as_array_of_shots: True
     shuffle_training: True
     train_frac: 0.75
     validation_frac: 0.33
-    batch_size: 128 #256
-    #THIS WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
+    batch_size: 128 # 256
+    # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
     max_patch_length: 100000
-    #How many shots are we loading at once?
+    # How many shots are we loading at once?
     num_shots_at_once: 200
-    num_epochs: 1000
+    num_epochs: 1000 # large number = maximum number of epochs. Early stopping will occur if loss does not decrease
     use_mock_data: False
     data_parallel: False
     hyperparam_tuning: False
     batch_generator_warmup_steps: 0
     use_process_generator: False
-    num_batches_minimum: 20 #minimum number of batches per epoch
-    ranking_difficulty_fac: 1.0 #how much to upweight incorrectly classified shots during training
+    num_batches_minimum: 20 # minimum number of batches per epoch
+    ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
 callbacks:
     list: ['earlystop']
     metrics: ['val_loss','val_roc','train_loss']
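
Per the comment on its first line, conf.py parses this YAML and extracts parameters based on what is specified. A minimal sketch of that pattern with PyYAML (a hypothetical standalone loader; the project's actual parser also derives filesystem paths under fs_path and fills in defaults):

import yaml  # PyYAML

with open('examples/conf.yaml') as f:
    conf = yaml.safe_load(f)

# Sections become nested dicts keyed exactly as in the file.
lr = conf['model']['lr']                     # 0.00002
num_epochs = conf['training']['num_epochs']  # an upper bound: the 'earlystop' callback can end training sooner

# The unquoted YAML scalar None parses as the *string* 'None', not Python
# None -- the motivation for the TODO(KGF) note in the diff above.
assert conf['data']['signal_to_augment'] == 'None'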
