Skip to content

Commit 1c87889

Browse files
committed
MAINT gzip compression was horribly slow, remove it again
1 parent 469b0ba commit 1c87889

File tree

5 files changed

+43
-110
lines changed

5 files changed

+43
-110
lines changed

autosklearn/ensemble_builder.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import multiprocessing
44
import glob
5-
import gzip
65
import os
76
import re
87
import sys
@@ -100,15 +99,15 @@ def main(self):
10099

101100
if self.shared_mode is False:
102101
dir_ensemble_list = sorted(glob.glob(os.path.join(
103-
dir_ensemble, 'predictions_ensemble_%s_*.npy.gz' % self.seed)))
102+
dir_ensemble, 'predictions_ensemble_%s_*.npy' % self.seed)))
104103
if exists[1]:
105104
dir_valid_list = sorted(glob.glob(os.path.join(
106-
dir_valid, 'predictions_valid_%s_*.npy.gz' % self.seed)))
105+
dir_valid, 'predictions_valid_%s_*.npy' % self.seed)))
107106
else:
108107
dir_valid_list = []
109108
if exists[2]:
110109
dir_test_list = sorted(glob.glob(os.path.join(
111-
dir_test, 'predictions_test_%s_*.npy.gz' % self.seed)))
110+
dir_test, 'predictions_test_%s_*.npy' % self.seed)))
112111
else:
113112
dir_test_list = []
114113
else:
@@ -127,8 +126,8 @@ def main(self):
127126
for dir_ensemble_file in dir_ensemble_list:
128127
if dir_ensemble_file.endswith("/"):
129128
dir_ensemble_file = dir_ensemble_file[:-1]
130-
if not dir_ensemble_file.endswith(".npy.gz"):
131-
self.logger.info('Error loading file (not .npy.gz): %s', dir_ensemble_file)
129+
if not dir_ensemble_file.endswith(".npy"):
130+
self.logger.info('Error loading file (not .npy): %s', dir_ensemble_file)
132131
continue
133132

134133
dir_ensemble_model_files.append(dir_ensemble_file)
@@ -161,7 +160,7 @@ def main(self):
161160
# later
162161
include_num_runs = []
163162
backup_num_runs = []
164-
model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy\.gz$')
163+
model_and_automl_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')
165164
if self.ensemble_nbest is not None:
166165
# Keeps track of the single scores of each model in our ensemble
167166
scores_nbest = []
@@ -179,7 +178,7 @@ def main(self):
179178
basename = os.path.basename(model_name)
180179

181180
try:
182-
with gzip.open(os.path.join(dir_ensemble, basename)) as fh:
181+
with open(os.path.join(dir_ensemble, basename), 'rb') as fh:
183182
if self.precision is "16":
184183
predictions = np.load(fh).astype(dtype=np.float16)
185184
elif self.precision is "32":
@@ -455,7 +454,7 @@ def get_predictions(self, dir_path, dir_path_list, include_num_runs,
455454
basename = os.path.basename(model_name)
456455

457456
if (automl_seed, num_run) in include_num_runs:
458-
with gzip.open(os.path.join(dir_path, basename)) as fh:
457+
with open(os.path.join(dir_path, basename), 'rb') as fh:
459458
if precision == "16":
460459
predictions = np.load(fh).astype(
461460
dtype=np.float16)

autosklearn/util/backend.py

Lines changed: 19 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import glob
2-
import gzip
32
import os
43
import tempfile
54
import time
@@ -170,7 +169,7 @@ def load_start_time(self, seed):
170169

171170
def _get_targets_ensemble_filename(self):
172171
return os.path.join(self.internals_directory,
173-
"true_targets_ensemble.npy.gz")
172+
"true_targets_ensemble.npy")
174173

175174
def save_targets_ensemble(self, targets):
176175
self._make_internals_directory()
@@ -196,7 +195,7 @@ def save_targets_ensemble(self, targets):
196195
lock_path = filepath + '.lock'
197196
with lockfile.LockFile(lock_path):
198197
if os.path.exists(filepath):
199-
with gzip.open(filepath) as fh:
198+
with open(filepath) as fh:
200199
existing_targets = np.load(fh)
201200
if existing_targets.shape[0] > targets.shape[0] or \
202201
(existing_targets.shape == targets.shape and
@@ -205,9 +204,7 @@ def save_targets_ensemble(self, targets):
205204

206205
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
207206
filepath), delete=False) as fh:
208-
zipfile = gzip.GzipFile(fileobj=fh)
209-
np.save(zipfile, targets.astype(np.float32))
210-
zipfile.close()
207+
np.save(fh, targets.astype(np.float32))
211208
tempname = fh.name
212209

213210
os.rename(tempname, filepath)
@@ -219,13 +216,13 @@ def load_targets_ensemble(self):
219216

220217
lock_path = filepath + '.lock'
221218
with lockfile.LockFile(lock_path):
222-
with gzip.open(filepath) as fh:
219+
with open(filepath, 'rb') as fh:
223220
targets = np.load(fh)
224221

225222
return targets
226223

227224
def _get_datamanager_pickle_filename(self):
228-
return os.path.join(self.internals_directory, 'datamanager.pkl.gz')
225+
return os.path.join(self.internals_directory, 'datamanager.pkl')
229226

230227
def save_datamanager(self, datamanager):
231228
self._make_internals_directory()
@@ -236,9 +233,7 @@ def save_datamanager(self, datamanager):
236233
if not os.path.exists(filepath):
237234
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
238235
filepath), delete=False) as fh:
239-
zipfile = gzip.GzipFile(fileobj=fh)
240-
pickle.dump(datamanager, zipfile, -1)
241-
zipfile.close()
236+
pickle.dump(datamanager, fh, -1)
242237
tempname = fh.name
243238
os.rename(tempname, filepath)
244239

@@ -248,35 +243,21 @@ def load_datamanager(self):
248243
filepath = self._get_datamanager_pickle_filename()
249244
lock_path = filepath + '.lock'
250245
with lockfile.LockFile(lock_path):
251-
with gzip.open(filepath, 'rb') as fh:
246+
with open(filepath, 'rb') as fh:
252247
return pickle.load(fh)
253248

254249
def get_model_dir(self):
255250
return os.path.join(self.internals_directory, 'models')
256251

257252
def save_model(self, model, idx, seed):
258253
# This should fail if no models directory exists
259-
try:
260-
filepath = os.path.join(self.get_model_dir(),
261-
'%s.%s.model.gz' % (seed, idx))
254+
filepath = os.path.join(self.get_model_dir(),
255+
'%s.%s.model' % (seed, idx))
262256

263-
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
264-
filepath), delete=False) as fh:
265-
zipfile = gzip.GzipFile(fileobj=fh)
266-
pickle.dump(model, zipfile, -1)
267-
zipfile.close()
257+
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
258+
filepath), delete=False) as fh:
259+
pickle.dump(model, fh, -1)
268260
tempname = fh.name
269-
# Actually I would like to catch a RecursionError here, but it turns out
270-
# that it was added in python3.5 and cannot be used in python3.4. But
271-
# since it is a subclass of RuntimeError this works fine as well
272-
except RuntimeError:
273-
filepath = os.path.join(self.get_model_dir(),
274-
'%s.%s.model' % (seed, idx))
275-
276-
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
277-
filepath), delete=False) as fh:
278-
pickle.dump(model, fh, -1)
279-
tempname = fh.name
280261

281262
os.rename(tempname, filepath)
282263

@@ -285,9 +266,7 @@ def load_all_models(self, seed):
285266

286267
if seed >= 0:
287268
model_files = glob.glob(os.path.join(model_directory,
288-
'%s.*.model.gz' % seed))
289-
model_files.extend(glob.glob(os.path.join(model_directory,
290-
'%s.*.model' % seed)))
269+
'%s.*.model' % seed))
291270
else:
292271
model_files = os.listdir(model_directory)
293272
model_files = [os.path.join(model_directory, mf) for mf in model_files]
@@ -303,7 +282,7 @@ def load_models_by_file_names(self, model_file_names):
303282
# File names are like: {seed}.{index}.model
304283
if model_file.endswith('/'):
305284
model_file = model_file[:-1]
306-
if not model_file.endswith('.model.gz') and \
285+
if not model_file.endswith('.model') and \
307286
not model_file.endswith('.model'):
308287
continue
309288

@@ -329,14 +308,10 @@ def load_models_by_identifiers(self, identifiers):
329308
def load_model_by_seed_and_id(self, seed, idx):
330309
model_directory = self.get_model_dir()
331310

332-
model_file_name = '%s.%s.model.gz' % (seed, idx)
311+
model_file_name = '%s.%s.model' % (seed, idx)
333312
model_file_path = os.path.join(model_directory, model_file_name)
334-
if os.path.exists(model_file_path):
335-
with gzip.open(model_file_path, 'rb') as fh:
336-
return pickle.load(fh)
337-
else:
338-
with open(model_file_path[:-3], 'rb') as fh:
339-
return pickle.load(fh)
313+
with open(model_file_path, 'rb') as fh:
314+
return pickle.load(fh)
340315

341316
def get_ensemble_dir(self):
342317
return os.path.join(self.internals_directory, 'ensembles')
@@ -387,14 +362,12 @@ def save_predictions_as_npy(self, predictions, subset, automl_seed, idx):
387362
if not os.path.exists(output_dir):
388363
os.makedirs(output_dir)
389364

390-
filepath = os.path.join(output_dir, 'predictions_%s_%s_%s.npy.gz' %
365+
filepath = os.path.join(output_dir, 'predictions_%s_%s_%s.npy' %
391366
(subset, automl_seed, str(idx)))
392367

393368
with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname(
394369
filepath), delete=False) as fh:
395-
zipfile = gzip.GzipFile(fileobj=fh)
396-
pickle.dump(predictions.astype(np.float32), zipfile, -1)
397-
zipfile.close()
370+
pickle.dump(predictions.astype(np.float32), fh, -1)
398371
tempname = fh.name
399372
os.rename(tempname, filepath)
400373

test/test_automl/test_automl.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# -*- encoding: utf-8 -*-
2-
import gzip
32
import os
43
import pickle
54
import sys
@@ -167,7 +166,7 @@ def test_automl_outputs(self):
167166
name = '31_bac'
168167
dataset = os.path.join(self.test_dir, '..', '.data', name)
169168
data_manager_file = os.path.join(output, '.auto-sklearn',
170-
'datamanager.pkl.gz')
169+
'datamanager.pkl')
171170

172171
backend_api = backend.create(output, output)
173172
auto = autosklearn.automl.AutoML(
@@ -177,14 +176,14 @@ def test_automl_outputs(self):
177176
auto.fit_automl_dataset(dataset)
178177

179178
# pickled data manager (without one hot encoding!)
180-
with gzip.open(data_manager_file, 'rb') as fh:
179+
with open(data_manager_file, 'rb') as fh:
181180
D = pickle.load(fh)
182181
self.assertTrue(np.allclose(D.data['X_train'][0, :3],
183182
[1., 12., 2.]))
184183

185184
# Check that all directories are there
186-
fixture = ['predictions_valid', 'true_targets_ensemble.npy.gz',
187-
'start_time_100', 'datamanager.pkl.gz',
185+
fixture = ['predictions_valid', 'true_targets_ensemble.npy',
186+
'start_time_100', 'datamanager.pkl',
188187
'predictions_ensemble',
189188
'ensembles', 'predictions_test', 'models']
190189
self.assertEqual(sorted(os.listdir(os.path.join(output,
@@ -195,11 +194,11 @@ def test_automl_outputs(self):
195194
# model and one ensemble
196195
fixture = os.listdir(os.path.join(output, '.auto-sklearn',
197196
'predictions_ensemble'))
198-
self.assertIn('predictions_ensemble_100_00001.npy.gz', fixture)
197+
self.assertIn('predictions_ensemble_100_00001.npy', fixture)
199198

200199
fixture = os.listdir(os.path.join(output, '.auto-sklearn',
201200
'models'))
202-
self.assertIn('100.1.model.gz', fixture)
201+
self.assertIn('100.1.model', fixture)
203202

204203
fixture = os.listdir(os.path.join(output, '.auto-sklearn',
205204
'ensembles'))
@@ -240,7 +239,7 @@ def test_do_dummy_prediction(self):
240239
'.auto-sklearn')))
241240
self.assertTrue(os.path.exists(os.path.join(
242241
output, '.auto-sklearn', 'predictions_ensemble',
243-
'predictions_ensemble_1_00001.npy.gz')))
242+
'predictions_ensemble_1_00001.npy')))
244243

245244
del auto
246245
self._tearDown(output)

test/test_automl/test_estimators.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,17 +109,17 @@ def test_fit_pSMAC(self):
109109
# more than 99%; it should be in the final ensemble if the ensemble
110110
# building of the second AutoSklearn classifier works correctly
111111
true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
112-
'true_targets_ensemble.npy.gz')
113-
with gzip.open(true_targets_ensemble_path) as fh:
112+
'true_targets_ensemble.npy')
113+
with open(true_targets_ensemble_path, 'rb') as fh:
114114
true_targets_ensemble = np.load(fh)
115115
true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
116116
probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
117117
for i, value in enumerate(true_targets_ensemble):
118118
probas[i, value] = 1.0
119119
dummy_predictions_path = os.path.join(output, '.auto-sklearn',
120120
'predictions_ensemble',
121-
'predictions_ensemble_1_00030.npy.gz')
122-
with gzip.open(dummy_predictions_path, 'wb') as fh:
121+
'predictions_ensemble_1_00030.npy')
122+
with open(dummy_predictions_path, 'wb') as fh:
123123
np.save(fh, probas)
124124

125125
probas_test = np.zeros((len(Y_test), 3), dtype=float)

test/test_util/test_backend.py

Lines changed: 5 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- encoding: utf-8 -*-
2+
import builtins
23
import sys
34
import unittest
45
import unittest.mock
@@ -23,28 +24,12 @@ def setUp(self):
2324
def test_load_models_by_file_names(self):
2425
self.backend.load_model_by_seed_and_id = unittest.mock.Mock()
2526
self.backend.load_model_by_seed_and_id.side_effect = lambda *args: args
26-
rval = self.backend.load_models_by_file_names(['1.2.model.gz',
27+
rval = self.backend.load_models_by_file_names(['1.2.model',
2728
'1.3.model',
2829
'1.4.models'])
2930
self.assertEqual(rval, {(1, 2): (1, 2),
3031
(1, 3): (1, 3)})
3132

32-
@unittest.mock.patch('pickle.load')
33-
@unittest.mock.patch('gzip.open')
34-
@unittest.mock.patch('os.path.exists')
35-
def test_load_model_by_seed_and_id_gz(self, exists_mock, openMock,
36-
pickleLoadMock):
37-
exists_mock.return_value = True
38-
seed = 13
39-
idx = 17
40-
expected_model = self._setup_load_model_mocks(openMock,
41-
pickleLoadMock,
42-
seed, idx)
43-
44-
actual_model = self.backend.load_model_by_seed_and_id(seed, idx)
45-
46-
self.assertEqual(expected_model, actual_model)
47-
4833
@unittest.mock.patch('pickle.load')
4934
@unittest.mock.patch('os.path.exists')
5035
def test_load_model_by_seed_and_id(self, exists_mock, pickleLoadMock):
@@ -55,14 +40,14 @@ def test_load_model_by_seed_and_id(self, exists_mock, pickleLoadMock):
5540
idx = 17
5641
expected_model = self._setup_load_model_mocks(open_mock,
5742
pickleLoadMock,
58-
seed, idx, zip=False)
43+
seed, idx)
5944

6045
actual_model = self.backend.load_model_by_seed_and_id(seed, idx)
6146

6247
self.assertEqual(expected_model, actual_model)
6348

6449
@unittest.mock.patch('pickle.load')
65-
@unittest.mock.patch('gzip.open')
50+
@unittest.mock.patch.object(builtins, 'open')
6651
@unittest.mock.patch('os.path.exists')
6752
def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock):
6853
exists_mock.return_value = True
@@ -76,11 +61,8 @@ def test_loads_models_by_identifiers(self, exists_mock, openMock, pickleLoadMock
7661
self.assertIsInstance(actual_dict, dict)
7762
self.assertDictEqual(expected_dict, actual_dict)
7863

79-
def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx,
80-
zip=True):
64+
def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx):
8165
model_path = '/model_directory/%s.%s.model' % (seed, idx)
82-
if zip:
83-
model_path += '.gz'
8466
file_handler = 'file_handler'
8567
expected_model = 'model'
8668

@@ -91,23 +73,3 @@ def _setup_load_model_mocks(self, openMock, pickleLoadMock, seed, idx,
9173
pickleLoadMock.side_effect = lambda fh: expected_model if fh == file_handler else None
9274

9375
return expected_model
94-
95-
@unittest.mock.patch('pickle.dump')
96-
def test_save_model_recursion_depth_error(self, dump_mock):
97-
class SideEffect(object):
98-
def __init__(self):
99-
self.num_calls = 0
100-
101-
def side_effect(self, *args):
102-
self.num_calls += 1
103-
if self.num_calls == 1:
104-
if sys.version_info >= (3, 5):
105-
raise RecursionError
106-
else:
107-
raise RuntimeError
108-
109-
dump_mock.side_effect = SideEffect().side_effect
110-
model = sklearn.tree.DecisionTreeClassifier()
111-
self.backend.get_model_dir = lambda: '/tmp/'
112-
self.backend.save_model(model, 1, 1)
113-
self.assertEqual(dump_mock.call_count, 2)

0 commit comments

Comments
 (0)