import glob
import multiprocessing
from random import shuffle
from statistics import mean, stdev

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import matplotlib
matplotlib.use('Agg')  # write plots straight to files; no display needed
import matplotlib.pyplot as plt
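# Assumed input format (inferred from the code below): each trial writes a
# temporal_csv_log.csv with at least a 'times' column (seconds since the
# trial started) and a 'val_roc' column (validation AUC, one row per epoch).
# A hypothetical log might look like:
#
#     times,val_roc
#     312.0,0.61
#     640.5,0.68
#     955.2,0.72
#
# The script compares a distributed hyperparameter search (all trials running
# concurrently) against an emulated sequential search (the same trials laid
# end to end in time).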
def arrangeTrialsAtRandom(filenames, scale=1.0):
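    """Emulate one sequential search: shuffle the trial logs, convert their
    timestamps from seconds to minutes, divide them by `scale`, and shift
    each trial so it starts when the previous one ends."""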
    # shuffle a copy so the caller's list is not reordered in place
    filenames = list(filenames)
    shuffle(filenames)
    previous = pd.read_csv(filenames[0])
    previous['times'] = previous['times'].apply(lambda x: x / 60.0 / scale)
    dataframes = [previous]
    for filename in filenames[1:]:
        # start this trial where the previous one ended
        shift = max(previous['times'].values)
        current = pd.read_csv(filename)
        current['times'] = current['times'].apply(lambda x: x / 60.0 / scale + shift)
        dataframes.append(current)
        previous = current
    return pd.concat(dataframes)

def getOneBestValidationAUC(T_of_test, dataset):
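    """Return the best validation AUC recorded at or before time T_of_test."""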
    # select the subset of epochs that finished before the cutoff time
    dataset = dataset[dataset.times <= T_of_test]
    aucs = dataset['val_roc'].values
    if len(aucs) > 0:
        return max(aucs)
    return 0.0

def doPlot(parallel_aucs, serial_aucs, times, errors):
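    """Plot the best-so-far validation AUC of the distributed and the emulated
    sequential search against wall-clock time, on linear and log time axes."""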
    # align values with the left bin edges: drop the first AUC sample and the
    # last time point so both arrays have the same length
    times = np.asarray(times)
    parallel_values = np.asarray(parallel_aucs[1:])
    serial_values = np.asarray(serial_aucs[1:])
    errors = np.asarray(errors[1:])
    edges = times[:-1]
    plt.figure()
    plt.plot(edges, parallel_values, label="Distributed search")
    plt.plot(edges, serial_values, label="Sequential search")
    #plt.fill_between(edges, serial_values - errors, serial_values + errors)
    plt.legend(loc=(0.6, 0.7))
    plt.xlabel("Time [minutes]", fontsize=20)
    plt.ylabel('Best validation AUC', fontsize=20)
    plt.savefig("times.png")

    plt.figure()
    plt.plot(edges, parallel_values, label="Distributed search")
    plt.plot(edges, serial_values, label="Sequential search")
    #plt.fill_between(edges, serial_values - errors, serial_values + errors)
    plt.legend(loc=(0.6, 0.7))
    plt.xlabel("Time [minutes]", fontsize=20)
    plt.xscale('log')
    plt.xlim([1, 100])  # a log axis cannot include 0, so start at 1 minute
    plt.ylabel('Best validation AUC', fontsize=20)
    plt.savefig("times_logx_start.png")

    plt.figure()
    plt.plot(edges, parallel_values, label="Distributed search")
    plt.plot(edges, serial_values, label="Sequential search")
    #plt.fill_between(edges, serial_values - errors, serial_values + errors)
    plt.legend(loc=(0.6, 0.7))
    plt.xlabel("Time [minutes]", fontsize=20)
    plt.xscale('log')
    plt.xlim([100, 10000])
    plt.ylabel('Best validation AUC', fontsize=20)
    plt.savefig("times_logx.png")

def getReplica(filenames, times):
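    """Return the best AUC seen up to each time in `times` for one shuffled
    sequential-search replica. Currently unused; the main block parallelizes
    over time points with getTimeReplicaSerial instead."""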
    serial_auc_replica = arrangeTrialsAtRandom(filenames, 100.0)

    best_serial_aucs_over_time = []
    for T in times:
        best_serial_aucs_over_time.append(getOneBestValidationAUC(T, serial_auc_replica))
    return best_serial_aucs_over_time

def getTimeReplica(filenames, T):
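    """Return the best validation AUC reached by any trial up to time T,
    with all trials treated as running concurrently (distributed search)."""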
    current_best = 0.0
    for filename in filenames:
        # load this trial's per-epoch AUCs and convert times to minutes
        try:
            dataset = pd.read_csv(filename)
            dataset['times'] = dataset['times'].apply(lambda x: x / 60.0)
        except (OSError, pd.errors.EmptyDataError):
            print("No data in {}".format(filename))
            continue
        current_best = max(current_best, getOneBestValidationAUC(T, dataset))
    return current_best

def getTimeReplicaSerial(serial_auc_replica, T):
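    """Return the best validation AUC of a pre-built sequential replica up to
    time T."""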
    return getOneBestValidationAUC(T, serial_auc_replica)

if __name__ == '__main__':
    filenames = glob.glob("/tigress/FRNN/JET_Titan_hyperparameter_run/*/temporal_csv_log.csv")
    patience = 5  # unused in this script

    # time grid (in minutes) at which the best-so-far AUC is sampled
    times = np.linspace(0, 310 * 30, 186 * 30)

    num_cores = multiprocessing.cpu_count()
    print("Running on", num_cores, "CPU cores")
    best_parallel_aucs_over_time = Parallel(n_jobs=num_cores)(
        delayed(getTimeReplica)(filenames, T) for T in times)

    # average several randomly ordered sequential replicas to smooth out the
    # dependence on trial ordering
    Nreplicas = 20
    replicas = []
    for i in range(Nreplicas):
        serial_auc_replica = arrangeTrialsAtRandom(filenames, 100.0)
        best_serial_aucs_over_time = Parallel(n_jobs=num_cores)(
            delayed(getTimeReplicaSerial)(serial_auc_replica, T) for T in times)
        replicas.append(best_serial_aucs_over_time)

    # per-time-point mean and standard deviation across replicas
    best_serial_aucs_over_time = list(map(mean, zip(*replicas)))
    errors = list(map(stdev, zip(*replicas)))

    doPlot(best_parallel_aucs_over_time, best_serial_aucs_over_time, times, errors)