
Commit 1c6af59

Merge pull request #678 from automl/development

Release 0.5.2

2 parents f39c8d3 + f236ded, commit 1c6af59

File tree

17 files changed: +889 -65 lines

.gitignore

Lines changed: 3 additions & 0 deletions

```diff
@@ -4,6 +4,7 @@ docs/build/*
 *.py[cod]
 
 # C extensions
+*.c
 *.so
 
 # Packages
@@ -46,3 +47,5 @@ download
 *.pkl
 num_run
 number_submission
+.pypirc
+dmypy.json
```

.travis.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -33,7 +33,7 @@ matrix:
   - os: linux
     env: DISTRIB="conda" COVERAGE="true" DOCPUSH="true" PYTHON="3.6"
   - os: linux
-    env: DISTRIB="conda" $TEST_DIST="true" PYTHON="3.7"
+    env: DISTRIB="conda" TEST_DIST="true" PYTHON="3.7"
   - os: linux
     env: DISTRIB="conda" EXAMPLES="true" PYTHON=3.7"
   - os: linux
```

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.5.1"
+__version__ = "0.5.2"
```

autosklearn/automl.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -1057,6 +1057,9 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
 
 
 class AutoMLRegressor(BaseAutoML):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
     def fit(
         self,
         X: np.ndarray,
```

autosklearn/ensemble_builder.py

Lines changed: 24 additions & 14 deletions

```diff
@@ -257,13 +257,15 @@ def read_ensemble_preds(self):
 
         if self.shared_mode is False:
             pred_path = os.path.join(
-                self.dir_ensemble,
-                'predictions_ensemble_%s_*.npy' % self.seed)
+                glob.escape(self.dir_ensemble),
+                'predictions_ensemble_%s_*.npy' % self.seed,
+            )
         # pSMAC
         else:
             pred_path = os.path.join(
-                self.dir_ensemble,
-                'predictions_ensemble_*_*.npy')
+                glob.escape(self.dir_ensemble),
+                'predictions_ensemble_*_*.npy',
+            )
 
         y_ens_files = glob.glob(pred_path)
         # no validation predictions so far -- no files
@@ -453,13 +455,21 @@ def get_valid_test_preds(self, selected_keys: list):
 
         for k in selected_keys:
             valid_fn = glob.glob(
-                os.path.join(self.dir_valid, 'predictions_valid_%d_%d.npy'
-                             % (self.read_preds[k]["seed"],
-                                self.read_preds[k]["num_run"])))
+                os.path.join(
+                    glob.escape(self.dir_valid),
+                    'predictions_valid_%d_%d.npy' % (
+                        self.read_preds[k]["seed"],
+                        self.read_preds[k]["num_run"])
+                )
+            )
             test_fn = glob.glob(
-                os.path.join(self.dir_test, 'predictions_test_%d_%d.npy' %
-                             (self.read_preds[k]["seed"],
-                              self.read_preds[k]["num_run"])))
+                os.path.join(
+                    glob.escape(self.dir_test),
+                    'predictions_test_%d_%d.npy' % (
+                        self.read_preds[k]["seed"],
+                        self.read_preds[k]["num_run"])
+                )
+            )
 
             # TODO don't read valid and test if not changed
             if len(valid_fn) == 0:
@@ -636,11 +646,11 @@ def predict(self, set_: str,
 
     def _read_np_fn(self, fp):
         if self.precision is "16":
-            predictions = np.load(fp).astype(dtype=np.float16)
+            predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float16)
         elif self.precision is "32":
-            predictions = np.load(fp).astype(dtype=np.float32)
+            predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float32)
         elif self.precision is "64":
-            predictions = np.load(fp).astype(dtype=np.float64)
+            predictions = np.load(fp, allow_pickle=True).astype(dtype=np.float64)
         else:
-            predictions = np.load(fp)
+            predictions = np.load(fp, allow_pickle=True)
         return predictions
```
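The glob.escape calls above implement issue #676: '[' and ']' are glob character-class metacharacters, so a temporary directory whose path contains brackets used to make these patterns match nothing. A minimal, self-contained sketch of the failure mode (illustration only, not repository code):

```python
import glob
import os
import tempfile

# A directory whose name contains glob metacharacters, as in issue #676.
base = tempfile.mkdtemp(suffix='[1]')
open(os.path.join(base, 'predictions_ensemble_1_2.npy'), 'w').close()

# Unescaped, '[1]' is read as a character class matching the single char '1',
# so the pattern points at a directory that does not exist.
print(glob.glob(os.path.join(base, 'predictions_ensemble_*.npy')))  # []

# glob.escape neutralizes the brackets and the lookup works again.
print(glob.glob(os.path.join(glob.escape(base), 'predictions_ensemble_*.npy')))
```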

autosklearn/util/backend.py

Lines changed: 12 additions & 8 deletions

```diff
@@ -244,7 +244,7 @@ def get_smac_output_directory_for_run(self, seed):
 
     def get_smac_output_glob(self, smac_run_id: Union[str, int] = 1) -> str:
         return os.path.join(
-            self.temporary_directory,
+            glob.escape(self.temporary_directory),
             'smac3-output',
             'run_%s' % str(smac_run_id),
         )
@@ -265,7 +265,7 @@ def save_targets_ensemble(self, targets):
         # number of times where we erronously keep a lock on the ensemble
         # targets file although the process already was killed
         try:
-            existing_targets = np.load(filepath)
+            existing_targets = np.load(filepath, allow_pickle=True)
             if existing_targets.shape[0] > targets.shape[0] or \
                     (existing_targets.shape == targets.shape and
                      np.allclose(existing_targets, targets)):
@@ -278,7 +278,7 @@ def save_targets_ensemble(self, targets):
         with lockfile.LockFile(lock_path):
             if os.path.exists(filepath):
                 with open(filepath, 'rb') as fh:
-                    existing_targets = np.load(fh)
+                    existing_targets = np.load(fh, allow_pickle=True)
                     if existing_targets.shape[0] > targets.shape[0] or \
                             (existing_targets.shape == targets.shape and
                              np.allclose(existing_targets, targets)):
@@ -299,7 +299,7 @@ def load_targets_ensemble(self):
         lock_path = filepath + '.lock'
         with lockfile.LockFile(lock_path):
             with open(filepath, 'rb') as fh:
-                targets = np.load(fh)
+                targets = np.load(fh, allow_pickle=True)
 
         return targets
 
@@ -346,8 +346,9 @@ def save_model(self, model, idx, seed):
     def list_all_models(self, seed):
         model_directory = self.get_model_dir()
         if seed >= 0:
-            model_files = glob.glob(os.path.join(model_directory,
-                                                 '%s.*.model' % seed))
+            model_files = glob.glob(
+                os.path.join(glob.escape(model_directory), '%s.*.model' % seed)
+            )
         else:
             model_files = os.listdir(model_directory)
             model_files = [os.path.join(model_directory, mf)
@@ -408,9 +409,11 @@ def load_ensemble(self, seed):
             self.logger.warning('Directory %s does not exist' % ensemble_dir)
             return None
 
+        print(seed)
         if seed >= 0:
-            indices_files = glob.glob(os.path.join(ensemble_dir,
-                                                   '%s.*.ensemble' % seed))
+            indices_files = glob.glob(
+                os.path.join(glob.escape(ensemble_dir), '%s.*.ensemble' % seed)
+            )
             indices_files.sort()
         else:
             indices_files = os.listdir(ensemble_dir)
@@ -419,6 +422,7 @@ def load_ensemble(self, seed):
 
         with open(indices_files[-1], 'rb') as fh:
             ensemble_members_run_numbers = pickle.load(fh)
+            print(indices_files)
 
         return ensemble_members_run_numbers
```
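The allow_pickle=True arguments restore compatibility with numpy 1.16.3 (FIX #667): since that release, np.load defaults to allow_pickle=False and refuses to load arrays containing pickled Python objects. A short sketch of the behavior change (the path and array are placeholders, not auto-sklearn data):

```python
import numpy as np

# An object array can only be stored via pickling.
arr = np.array([{'seed': 1}, {'seed': 2}], dtype=object)
np.save('/tmp/targets.npy', arr)  # np.save pickles object arrays by default

try:
    np.load('/tmp/targets.npy')  # allow_pickle defaults to False in numpy >= 1.16.3
except ValueError as err:
    print('refused to load:', err)

# Opting back in explicitly, as the diff above does, loads the file again.
loaded = np.load('/tmp/targets.npy', allow_pickle=True)
print(loaded[0]['seed'])  # 1
```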

doc/releases.rst

Lines changed: 16 additions & 0 deletions

```diff
@@ -11,6 +11,22 @@
 Releases
 ========
 
+Version 0.5.2
+=============
+
+* FIX #669: Correctly handle arguments to the ``AutoMLRegressor``.
+* FIX #667: Auto-sklearn works with numpy 1.16.3 again.
+* ADD #676: Allow brackets [ ] inside the temporary and output directory paths.
+* ADD #424: (Experimental) scripts to reproduce the results from the original Auto-sklearn paper.
+
+Contributors
+************
+
+* Jin Woo Ahn
+* Herilalaina Rakotoarison
+* Matthias Feurer
+* yazanobeidi
+
 Version 0.5.1
 =============
```

scripts/2015_nips_paper/Readme.md

Lines changed: 34 additions & 0 deletions (new file)

## Reproduce results of Efficient and Robust Automated Machine Learning (Feurer et al.)
This folder contains all scripts necessary to reproduce the results shown in
Figure 3 of Efficient and Robust Automated Machine Learning (Feurer et al.). The
scripts can be modified to include different datasets, change the runtime, etc. They
only handle classification tasks, and balanced accuracy is used as the score metric.

### 1. Creating commands.txt
To run the experiment, first create commands.txt by running:
```bash
cd setup
bash create_commands.sh
```
The script can be modified to run experiments with different settings, e.g. a
different runtime and/or different tasks.

### 2. Executing commands.txt
Run each command in commands.txt:
```bash
cd run
bash run_commands.sh
```
Each command line in commands.txt first fits a model and then creates the single-best
and ensemble trajectories. The commands are therefore independent of each other and
can be run in parallel on a cluster by modifying run_commands.sh; a possible parallel
driver is sketched after this Readme.

### 3. Plotting the results
To plot the results, run:
```bash
cd plot
python plot_ranks.py
```
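Since the command lines are independent, a small Python driver could dispatch them concurrently. This is a minimal sketch (not part of the repository; run_commands.sh remains the supported entry point), and the file name and worker count are assumptions:

```python
#!/usr/bin/env python3
"""Hypothetical parallel driver for commands.txt (illustration only)."""
import subprocess
from concurrent.futures import ThreadPoolExecutor


def run(cmd):
    # Each line of commands.txt is assumed to be a complete shell command.
    return subprocess.run(cmd, shell=True).returncode


with open('commands.txt') as fh:
    commands = [line.strip() for line in fh if line.strip()]

# Threads suffice here: the actual work happens in the spawned subprocesses.
with ThreadPoolExecutor(max_workers=4) as pool:
    exit_codes = list(pool.map(run, commands))

print('%d of %d commands failed' % (sum(c != 0 for c in exit_codes), len(commands)))
```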
scripts/2015_nips_paper/plot/plot_ranks.py

Lines changed: 165 additions & 0 deletions (new file)

```python
#!/usr/bin/env python3

import csv
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def read_csv(fn, has_header=True, data_type=str):
    """
    Function which reads the csv files containing trajectories
    of the auto-sklearn runs.
    """
    data = list()
    header = None
    with open(fn, 'r') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in csv_reader:
            if header is None and has_header:
                header = row
                continue
            data.append(list(map(data_type, [i.strip() for i in row])))
    return header, data


def fill_trajectory(performance_list, time_list):
    # Create one series per seed, indexed by that seed's time stamps.
    series_list = []
    for n in range(len(time_list)):
        series_list.append(pd.Series(data=performance_list[n], index=time_list[n]))

    # Concatenate to one DataFrame with NaN values where a seed has no entry.
    series = pd.concat(series_list, axis=1)

    # Fill missing performance values (NaNs) with the last non-NaN value.
    series = series.fillna(method='ffill')

    # Return the trajectories over seeds.
    return series


def main():
    # Name of the file where the plot is stored.
    saveto = "../plot.png"
    # Runtime of each experiment.
    max_runtime = 3600
    # Folder where all trajectories are stored.
    working_directory = "../log_output"

    # List of models.
    model_list = ['vanilla', 'ensemble', 'metalearning', 'meta_ensemble']

    # List of seeds.
    seed_dir = os.path.join(working_directory, 'vanilla')
    seed_list = [seed for seed in os.listdir(seed_dir)]

    # List of tasks.
    vanilla_task_dir = os.path.join(seed_dir, seed_list[0])
    task_list = [task_id for task_id in os.listdir(vanilla_task_dir)]

    # Step 1. Merge all trajectories into one DataFrame object.
    ################################################################################
    all_trajectories = []

    for model in model_list:
        trajectories = []
        for task_id in task_list:
            csv_files = []

            for seed in seed_list:
                # Collect all csv files of different seeds for the current model
                # and current task.
                if model in ['vanilla', 'ensemble']:
                    csv_file = os.path.join(working_directory,
                                            'vanilla',
                                            seed,
                                            task_id,
                                            "score_{}.csv".format(model))
                elif model in ['metalearning', 'meta_ensemble']:
                    csv_file = os.path.join(working_directory,
                                            'metalearning',
                                            seed,
                                            task_id,
                                            "score_{}.csv".format(model))
                csv_files.append(csv_file)

            performance_list = []
            time_list = []

            # Get data from csv.
            for fl in csv_files:
                _, csv_data = read_csv(fl, has_header=True)
                csv_data = np.array(csv_data)
                # Replace too-high values with sys.maxsize; test trajectories
                # are stored in the third column.
                data = [min([sys.maxsize, float(i.strip())])
                        for i in csv_data[:, 2]]

                time_steps = [float(i.strip()) for i in csv_data[:, 0]]
                assert time_steps[0] == 0

                performance_list.append(data)
                time_list.append(time_steps)

            # trajectory is the pd.DataFrame object containing all seed runs of
            # the current model and current task.
            trajectory = fill_trajectory(performance_list, time_list)
            trajectories.append(trajectory)

        # list[list[pd.DataFrame]]
        all_trajectories.append(trajectories)

    # Step 2. Compute average ranks of the trajectories.
    ################################################################################
    all_rankings = []
    n_iter = 500  # number of bootstrap samples to use for estimating the ranks
    n_tasks = len(task_list)

    for i in range(n_iter):
        # Pick one random seed column per model.
        pick = np.random.choice(all_trajectories[0][0].shape[1],
                                size=(len(model_list)))

        for j in range(n_tasks):
            all_trajectories_tmp = pd.DataFrame(
                {model_list[k]: at[j].iloc[:, pick[k]]
                 for k, at in enumerate(all_trajectories)}
            )
            all_trajectories_tmp = all_trajectories_tmp.fillna(method='ffill',
                                                               axis=0)
            r_tmp = all_trajectories_tmp.rank(axis=1)
            all_rankings.append(r_tmp)

    final_ranks = []
    for i, model in enumerate(model_list):
        ranks_for_model = []
        for ranking in all_rankings:
            ranks_for_model.append(ranking.loc[:, model])
        ranks_for_model = pd.DataFrame(ranks_for_model)
        ranks_for_model = ranks_for_model.fillna(method='ffill', axis=1)
        final_ranks.append(ranks_for_model.mean(skipna=True))

    # Step 3. Plot the average ranks over time.
    ################################################################################
    for i, model in enumerate(model_list):
        X_data = []
        y_data = []
        for x, y in final_ranks[i].iteritems():
            X_data.append(x)
            y_data.append(y)
        X_data.append(max_runtime)
        y_data.append(y)
        plt.plot(X_data, y_data, label=model)
    plt.xlabel('time [sec]')
    plt.ylabel('average rank')
    plt.legend()
    plt.savefig(saveto)


if __name__ == "__main__":
    main()
```
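How fill_trajectory aligns runs: each seed records scores on its own time grid, so concatenating the per-seed series on the union of time stamps and forward-filling yields one step-function column per seed. A toy example with invented values:

```python
import pandas as pd

# Two seeds, each with its own time grid (toy values).
performance_list = [[2.0, 1.5, 1.0], [2.0, 1.2]]
time_list = [[0, 10, 40], [0, 25]]

series = pd.concat(
    [pd.Series(p, index=t) for p, t in zip(performance_list, time_list)],
    axis=1,
).fillna(method='ffill')
print(series)
#       0    1
# 0   2.0  2.0
# 10  1.5  2.0
# 25  1.5  1.2
# 40  1.0  1.2
```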
