Skip to content

Commit eaa3629

Browse files
committed
Extract time to solution plots
1 parent 44461cd commit eaa3629

File tree

1 file changed

+159
-0
lines changed

1 file changed

+159
-0
lines changed

examples/extract_best_overtime.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import pandas as pd
2+
import glob
3+
from subprocess import Popen
4+
import yaml
5+
import os
6+
import math
7+
import numpy as np
8+
from random import shuffle
9+
from joblib import Parallel, delayed
10+
import multiprocessing
11+
12+
import matplotlib
13+
matplotlib.use('Agg')
14+
import matplotlib.pylab as plt
15+
16+
import pdb
17+
18+
def arrangeTrialsAtRandom(filenames, scale=1.0):
    """Emulate one *sequential* run of all trials by chaining their logs.

    The per-trial CSV logs are visited in a random order; each file's
    'times' column (seconds) is converted to minutes and divided by
    ``scale``, and every trial after the first is shifted so it starts
    where the previous trial finished.

    Parameters
    ----------
    filenames : list of str
        Paths to per-trial CSV logs, each with a 'times' column.
    scale : float, optional
        Extra compression factor applied to the times (e.g. 100.0).

    Returns
    -------
    pandas.DataFrame
        Concatenation of all trial logs on one shifted serial timeline.
    """
    # Shuffle a *copy*: the original code shuffled the caller's list in
    # place, a surprising side effect for anyone reusing `filenames`.
    order = list(filenames)
    shuffle(order)

    previous = pd.read_csv(order[0])
    # Vectorized arithmetic instead of per-element .apply(lambda ...).
    previous['times'] = previous['times'] / 60.0 / scale
    dataframes = [previous]
    for filename in order[1:]:
        # Start this trial where the previous one ended.
        shift = previous['times'].max()
        current = pd.read_csv(filename)
        current['times'] = current['times'] / 60.0 / scale + shift
        dataframes.append(current)
        previous = current
    return pd.concat(dataframes)
30+
31+
def getOneBestValidationAUC(T_of_test, dataset):
    """Return the best validation AUC achieved at or before ``T_of_test``.

    Rows of ``dataset`` whose 'times' exceed the cutoff are ignored; when
    no row qualifies, 0.0 is returned.
    """
    # Restrict attention to the results available by the cutoff time.
    eligible = dataset.loc[dataset['times'] <= T_of_test, 'val_roc'].values
    if len(eligible) == 0:
        return 0.0
    return max(eligible)
41+
42+
def _plot_auc_curves(edges, parallel_values, serial_values,
                     xscale=None, xlim=None, outfile="times.png"):
    """Plot distributed vs. sequential best-AUC curves and save to ``outfile``."""
    plt.figure()
    plt.plot(edges, parallel_values, label="Distributed search")
    plt.plot(edges, serial_values, label="Sequential search")
    plt.legend(loc=(0.6, 0.7))
    plt.xlabel("Time [minutes]", fontsize=20)
    if xscale is not None:
        plt.xscale(xscale)
    if xlim is not None:
        plt.xlim(xlim)
    plt.ylabel('Best validation AUC', fontsize=20)
    plt.savefig(outfile)


def doPlot(parallel_aucs, serial_aucs, times, errors):
    """Save three time-to-solution plots (linear, log-x start, log-x tail).

    Parameters
    ----------
    parallel_aucs, serial_aucs : sequences of best-AUC values sampled at
        each entry of ``times`` (distributed vs. emulated sequential search).
    times : sequence of cutoff times in minutes (the x axis).
    errors : per-time stdev across serial replicas (currently unused in the
        plots; the fill_between band was disabled upstream).
    """
    # Drop the first sample and the last edge so x and y line up
    # (len(times) == len(aucs); plotting uses pairs of equal length).
    # The original also built an unused np.histogram here -- removed.
    times = list(times)
    parallel_values = parallel_aucs[1:]
    serial_values = np.array(serial_aucs[1:])
    errors = np.array(errors[1:])
    edges = np.array(times[:-1])
    # Debug output retained from the original: confirm aligned lengths.
    print(len(parallel_values))
    print(len(edges))
    print(errors.shape)
    print(edges.shape)
    print(serial_values.shape)

    _plot_auc_curves(edges, parallel_values, serial_values,
                     outfile="times.png")
    _plot_auc_curves(edges, parallel_values, serial_values,
                     xscale='log', xlim=[0, 100],
                     outfile="times_logx_start.png")
    _plot_auc_curves(edges, parallel_values, serial_values,
                     xscale='log', xlim=[100, 10000],
                     outfile="times_logx.png")
89+
90+
91+
def getReplica(filenames, times):
    """Build one random serial ordering of the trials and return the best
    validation AUC achievable at each cutoff time in ``times``.

    The serial timeline is compressed by the hard-coded factor 100.0 to
    match the scale used in ``__main__``.
    """
    serial_auc_replica = arrangeTrialsAtRandom(filenames, 100.0)
    # The original reset `current_best` to 0 on every iteration, so the
    # loop reduced to max(auc, 0) per cutoff -- written directly here.
    return [max(getOneBestValidationAUC(T, serial_auc_replica), 0.0)
            for T in times]
105+
106+
def getTimeReplica(filenames, T):
    """Return the best validation AUC reached by time ``T`` (minutes)
    across all independent (distributed) trials in ``filenames``.

    Files that are missing, unparsable, empty, or lacking the expected
    'times' column are skipped with a warning.
    """
    current_best = 0.0
    for filename in filenames:
        # Get AUCs for this trial, one per effective epoch.
        try:
            dataset = pd.read_csv(filename)
            # Log times are recorded in seconds; convert to minutes.
            dataset['times'] = dataset['times'] / 60.0
        except (OSError, KeyError, pd.errors.EmptyDataError,
                pd.errors.ParserError):
            # Narrowed from a bare `except:` so genuine bugs and
            # KeyboardInterrupt are no longer silently swallowed.
            print("No data in {}".format(filename))
            continue
        auc = getOneBestValidationAUC(T, dataset)
        if auc > current_best:
            current_best = auc
    return current_best
120+
121+
def getTimeReplicaSerial(serial_auc_replica, T):
    """Return the best validation AUC reached by time ``T`` in one
    pre-built serial (sequential-search) replica.

    The original initialized ``current_best = 0`` and kept ``auc`` only if
    greater, which is exactly a clamp to non-negative values.
    """
    return max(getOneBestValidationAUC(T, serial_auc_replica), 0.0)
129+
130+
131+
if __name__ == '__main__':

    # Per-trial training logs from the hyperparameter scan (cluster path).
    filenames = glob.glob(
        "/tigress/FRNN/JET_Titan_hyperparameter_run/*/temporal_csv_log.csv")

    # Time grid (minutes) at which "best AUC so far" curves are sampled.
    times = np.linspace(0, 310 * 30, 186 * 30)

    num_cores = multiprocessing.cpu_count()
    print("Running on ", num_cores, " CPU cores")

    # Distributed search: all trials ran concurrently, so the best AUC at
    # cutoff T is the best over every trial's results up to T.
    # (The original pre-assigned an empty list here that was immediately
    # overwritten, and set an unused `patience` -- both removed.)
    best_parallel_aucs_over_time = Parallel(n_jobs=num_cores)(
        delayed(getTimeReplica)(filenames, T) for T in times)

    # Sequential search: emulate running the trials one after another in a
    # random order; average over several random orderings (replicas).
    Nreplicas = 20
    replicas = []
    for i in range(Nreplicas):
        serial_auc_replica = arrangeTrialsAtRandom(filenames, 100.0)
        best_serial_aucs_over_time = Parallel(n_jobs=num_cores)(
            delayed(getTimeReplicaSerial)(serial_auc_replica, T)
            for T in times)
        replicas.append(best_serial_aucs_over_time)

    # Mean curve and per-time spread across the serial replicas.
    from statistics import mean, stdev
    best_serial_aucs_over_time = list(map(mean, zip(*replicas)))
    errors = list(map(stdev, zip(*replicas)))

    doPlot(best_parallel_aucs_over_time, best_serial_aucs_over_time,
           times, errors)

0 commit comments

Comments
 (0)