import argparse
import os

import matplotlib
import numpy as np
import pandas as pd

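# Select the non-interactive Agg backend before pyplot is imported so figures
# can be written to files on headless nodes.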
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import (
    CSVLogger,
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
)
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.optimizers import SGD

file_path = os.path.dirname(os.path.realpath(__file__))

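# Mirror the model across the listed devices; uncomment entries to scale out
# to additional XPUs.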
strategy = tf.distribute.MirroredStrategy([
    '/xpu:0'
    # ,'/xpu:1'
    # ,'/xpu:2'
    # ,'/xpu:3'
    # ,'/xpu:4'
    # ,'/xpu:5'
    # ,'/xpu:6','/xpu:7'
    # ,'/xpu:8','/xpu:9'
    # ,'/xpu:10','/xpu:11'
])
print('tensorflow version: {}'.format(tf.__version__))
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# parse args
psr = argparse.ArgumentParser(description="input csv or parquet file")
psr.add_argument("--in", default="in_file")
psr.add_argument("--ep", type=int, default=400)
args = vars(psr.parse_args())

EPOCH = args["ep"]
BATCH = 32  # per-replica batch size
GLOBAL_BATCH_SIZE = BATCH * strategy.num_replicas_in_sync
DR = 0.1  # dropout rate
data_path = args["in"]
print(args)

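# Coefficient of determination (R^2) as a Keras metric; K.epsilon() guards
# against division by zero when the targets are constant.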
def r2(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res / (SS_tot + K.epsilon())


def load_data():

    data_path = args["in"]

    df = pd.read_csv(data_path, skiprows=1).values.astype("float32")
    df_y = df[:, 0].astype("float32")
    # Column 0 is the regression target; the remaining columns are features.
    df_x = df[:, 1:].astype(np.float32)
    print('df_y: {}\n{}'.format(df_y.shape, df_y))
    print('df_x: {}\n{}'.format(df_x.shape, df_x))

    scaler = StandardScaler()
    df_x = scaler.fit_transform(df_x)

    X_train, X_test, Y_train, Y_test = train_test_split(
        df_x, df_y, test_size=0.20, random_state=42
    )

    return X_train, Y_train, X_test, Y_test

def load_data_from_parquet():

    data_path = args["in"]

    df = pd.read_parquet(data_path)
    df_y = df['reg'].values.astype("float32")
    # The first six columns are assumed to hold metadata; features start at
    # column 6.
    df_x = df.iloc[:, 6:].values.astype("float32")
    print('df_y: {}\n{}'.format(df_y.shape, df_y))
    print('df_x: {}\n{}'.format(df_x.shape, df_x))

    scaler = StandardScaler()
    df_x = scaler.fit_transform(df_x)

    X_train, X_test, Y_train, Y_test = train_test_split(
        df_x, df_y, test_size=0.20, random_state=42
    )

    return X_train, Y_train, X_test, Y_test


#X_train, Y_train, X_test, Y_test = load_data()
X_train, Y_train, X_test, Y_test = load_data_from_parquet()
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

steps = X_train.shape[0] // GLOBAL_BATCH_SIZE
validation_steps = X_test.shape[0] // GLOBAL_BATCH_SIZE
print('samples {}, global_batch_size {}, steps {}'.format(X_train.shape[0], GLOBAL_BATCH_SIZE, steps))
print('val samples {}, global_batch_size {}, val_steps {}'.format(X_test.shape[0], GLOBAL_BATCH_SIZE, validation_steps))

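# Batch at the global batch size so each replica receives BATCH samples per
# step; drop_remainder keeps every per-replica shard the same size, and
# repeat(EPOCH) provides enough batches for the full run.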
train_ds = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(
    GLOBAL_BATCH_SIZE, drop_remainder=True
).repeat(EPOCH)
val_ds = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(
    GLOBAL_BATCH_SIZE, drop_remainder=True
).repeat(EPOCH)

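# AutoShardPolicy.DATA shards by elements rather than by files, which is
# required for in-memory datasets like these tensor slices.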
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
train_ds = train_ds.with_options(options)
val_ds = val_ds.with_options(options)

train_dist = strategy.experimental_distribute_dataset(train_ds)
val_dist = strategy.experimental_distribute_dataset(val_ds)

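# A funnel-shaped MLP regressor (250 -> 125 -> 60 -> 30 -> 1) built and
# compiled inside strategy.scope() so its variables are mirrored across
# replicas.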
with strategy.scope():
    #inputs = Input(shape=(PS,))
    inputs = Input(shape=(1826,))
    x = Dense(250, activation="relu")(inputs)
    x = Dropout(DR)(x)
    x = Dense(125, activation="relu")(x)
    x = Dropout(DR)(x)
    x = Dense(60, activation="relu")(x)
    x = Dropout(DR)(x)
    x = Dense(30, activation="relu")(x)
    x = Dropout(DR)(x)
    outputs = Dense(1, activation="relu")(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.summary()
    model.compile(
        loss="mean_squared_error",
        optimizer=SGD(learning_rate=0.0001, momentum=0.9),
        metrics=["mae", r2],
    )

# Callbacks: keep the best checkpoint, log per-epoch metrics to CSV, anneal
# the learning rate when val_loss plateaus, and stop early if it stalls.

checkpointer = ModelCheckpoint(
    filepath="reg_go.autosave.model.h5",
    verbose=1,
    save_weights_only=False,
    save_best_only=True,
)
csv_logger = CSVLogger("reg_go.training.log")
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.75,
    patience=20,
    verbose=1,
    mode="auto",
    min_delta=0.0001,
    cooldown=3,
    min_lr=1e-9,
)
early_stop = EarlyStopping(monitor="val_loss", patience=100, verbose=1, mode="auto")

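# Train on the distributed datasets; they are already batched, so no
# batch_size is passed to fit(), and steps_per_epoch / validation_steps bound
# each epoch because the datasets repeat.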
from datetime import datetime as dt

print("{} calling model.fit".format(dt.now().strftime("%D %H:%M:%S.%f")))
history = model.fit(
    train_dist,
    steps_per_epoch=int(steps),
    epochs=EPOCH,
    verbose=1,
    validation_data=val_dist,
    validation_steps=validation_steps,
    callbacks=[checkpointer, csv_logger, reduce_lr, early_stop],
)
print("{} done calling model.fit".format(dt.now().strftime("%D %H:%M:%S.%f")))


score = model.evaluate(X_test, Y_test, verbose=0)

print(score)

print(history.history.keys())
# dict_keys(['val_loss', 'val_mae', 'val_r2', 'loss', 'mae', 'r2', 'lr'])

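# Plot the training curves; "test" in the legends refers to the held-out
# validation split.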
# summarize history for MAE
plt.plot(history.history["mae"])
plt.plot(history.history["val_mae"])

plt.title("Model Mean Absolute Error")
plt.ylabel("mae")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")

plt.savefig("reg_go.mae.png", bbox_inches="tight")
plt.savefig("reg_go.mae.pdf", bbox_inches="tight")

plt.close()

# summarize history for loss
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model Loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")

plt.savefig("reg_go.loss.png", bbox_inches="tight")
plt.savefig("reg_go.loss.pdf", bbox_inches="tight")

plt.close()

print("Test loss:", score[0])
print("Test mae:", score[1])

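# Persist the trained network: architecture as JSON, weights as HDF5.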
# serialize model to JSON
model_json = model.to_json()
with open("reg_go.model.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("reg_go.model.h5")
print("Saved model to disk")

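# Reload the architecture and weights from disk to verify that the saved
# model reproduces the in-memory results.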
# load json and create model
with open("reg_go.model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("reg_go.model.h5")
print("Loaded json model from disk")

# evaluate json loaded model on test data
loaded_model.compile(
    loss="mean_squared_error", optimizer="SGD", metrics=["mean_absolute_error"]
)
score_json = loaded_model.evaluate(X_test, Y_test, verbose=0)

print("json Validation loss:", score_json[0])
print("json Validation mae:", score_json[1])

predict_json_train = loaded_model.predict(X_train)
predict_json_test = loaded_model.predict(X_test)

pred_train = predict_json_train[:, 0]
pred_test = predict_json_test[:, 0]

np.savetxt("pred_train.csv", pred_train, delimiter=",", newline="\n", fmt="%.3f")
np.savetxt("pred_test.csv", pred_test, delimiter=",", newline="\n", fmt="%.3f")

print("Correlation prediction on test and Y_test:", np.corrcoef(pred_test, Y_test))
print("Correlation prediction on train and Y_train:", np.corrcoef(pred_train, Y_train))