#!/usr/bin/env python3

import csv
import sys
import os

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
def read_csv(fn, has_header=True, data_type=str):
    """Read a trajectory csv file produced by the auto-sklearn runs.

    Returns a ``(header, rows)`` tuple: ``header`` is the first row when
    ``has_header`` is true (otherwise ``None``), and every cell of the
    remaining rows is stripped of surrounding whitespace and converted
    with ``data_type``.
    """
    header = None
    rows = []
    with open(fn, 'r') as fh:
        reader = csv.reader(fh, delimiter=',', quotechar='|')
        for row_no, record in enumerate(reader):
            if has_header and row_no == 0:
                header = record
            else:
                rows.append([data_type(cell.strip()) for cell in record])
    return header, rows
| 28 | + |
| 29 | + |
def fill_trajectory(performance_list, time_list):
    """Merge per-seed trajectories into one pandas object over time.

    Parameters
    ----------
    performance_list : list of sequences of performance values, one per seed.
    time_list : list of matching time-stamp sequences, one per seed.

    Returns
    -------
    A pandas object indexed by the union of all time stamps with one
    column per seed; gaps are forward-filled from the last known value.
    """
    # One Series per seed, indexed by that seed's time stamps.
    series_list = [
        pd.Series(data=perf, index=times)
        for perf, times in zip(performance_list, time_list)
    ]

    # Outer-join on the time index; time stamps missing for a seed
    # become NaN in that seed's column.
    series = pd.concat(series_list, axis=1)

    # Forward-fill the NaNs with the last observed value.
    # NOTE: fillna(method='ffill') is deprecated since pandas 2.1 and
    # removed in pandas 3.0 — ffill() is the supported equivalent.
    return series.ffill()
| 44 | + |
| 45 | + |
def main():
    """Plot average ranks over time of the four auto-sklearn variants.

    Reads the per-seed/per-task trajectory csv files from
    ``working_directory``, estimates average ranks across tasks via
    bootstrapping and saves a rank-vs-time plot to ``saveto``.
    """
    # File the final plot is written to.
    saveto = "../plot.png"
    # Runtime (seconds) of each experiment; used to extend the last
    # observed rank to the right edge of the plot.
    max_runtime = 3600
    # Folder where all trajectories are stored.
    working_directory = "../log_output"

    # Models to compare. 'vanilla' and 'ensemble' scores live in the
    # 'vanilla' run directory, the other two in 'metalearning'.
    model_list = ['vanilla', 'ensemble', 'metalearning', 'meta_ensemble']

    # Seeds: sub-directories of the vanilla run directory.
    seed_dir = os.path.join(working_directory, 'vanilla')
    seed_list = os.listdir(seed_dir)

    # Tasks: sub-directories of the first seed's directory (assumed to
    # be identical across seeds).
    vanilla_task_dir = os.path.join(seed_dir, seed_list[0])
    task_list = os.listdir(vanilla_task_dir)

    # Step 1. Merge all trajectories into one DataFrame object per
    # (model, task) pair.
    #####################################################################
    all_trajectories = []

    for model in model_list:
        trajectories = []
        for task_id in task_list:
            # Collect the csv files of all seeds for the current
            # model/task combination.
            base_dir = ('vanilla' if model in ('vanilla', 'ensemble')
                        else 'metalearning')
            csv_files = [
                os.path.join(working_directory,
                             base_dir,
                             seed,
                             task_id,
                             "score_{}.csv".format(model))
                for seed in seed_list
            ]

            performance_list = []
            time_list = []

            # Get data from the csv files.
            for fl in csv_files:
                _, csv_data = read_csv(fl, has_header=True)
                csv_data = np.array(csv_data)
                # Clip unreasonably large values to sys.maxsize; the
                # test trajectory is stored in the third column.
                data = [min(sys.maxsize, float(i.strip()))
                        for i in csv_data[:, 2]]
                # Time stamps are stored in the first column and must
                # start at time 0.
                time_steps = [float(i.strip()) for i in csv_data[:, 0]]
                assert time_steps[0] == 0

                performance_list.append(data)
                time_list.append(time_steps)

            # pandas object holding all seed runs of the current
            # model/task combination.
            trajectories.append(fill_trajectory(performance_list, time_list))

        # all_trajectories: models x tasks (list[list[pandas object]])
        all_trajectories.append(trajectories)

    # Step 2. Compute average ranks of the trajectories.
    #####################################################################
    all_rankings = []
    n_iter = 500  # number of bootstrap samples used to estimate the ranks
    n_tasks = len(task_list)

    for _ in range(n_iter):
        # Draw one random seed column per model (with replacement).
        pick = np.random.choice(all_trajectories[0][0].shape[1],
                                size=(len(model_list)))

        for j in range(n_tasks):
            all_trajectories_tmp = pd.DataFrame(
                {model_list[k]: at[j].iloc[:, pick[k]]
                 for k, at in enumerate(all_trajectories)}
            )
            # ffill() replaces the deprecated fillna(method='ffill'),
            # removed in pandas 3.0.
            all_trajectories_tmp = all_trajectories_tmp.ffill(axis=0)
            all_rankings.append(all_trajectories_tmp.rank(axis=1))

    final_ranks = []
    for model in model_list:
        ranks_for_model = [ranking.loc[:, model] for ranking in all_rankings]
        ranks_for_model = pd.DataFrame(ranks_for_model)
        ranks_for_model = ranks_for_model.ffill(axis=1)
        # Average rank of this model at each time stamp.
        final_ranks.append(ranks_for_model.mean(skipna=True))

    # Step 3. Plot the average ranks over time.
    #####################################################################
    for i, model in enumerate(model_list):
        X_data = []
        y_data = []
        # items() replaces Series.iteritems(), removed in pandas 2.0.
        for x, y in final_ranks[i].items():
            X_data.append(x)
            y_data.append(y)
        # Extend the last observed rank to the end of the experiment.
        X_data.append(max_runtime)
        y_data.append(y_data[-1])
        plt.plot(X_data, y_data, label=model)
    plt.xlabel('time [sec]')
    plt.ylabel('average rank')
    plt.legend()
    plt.savefig(saveto)


if __name__ == "__main__":
    main()