
Commit 3743b25

First push to prevent files in working dir (#1021)

1 parent c1fbd6c commit 3743b25

File tree

11 files changed: +190 −86 lines


.github/workflows/pytest.yml

Lines changed: 17 additions & 2 deletions

@@ -56,6 +56,10 @@ jobs:
           python setup.py sdist
           last_dist=$(ls -t dist/auto-sklearn-*.tar.gz | head -n 1)
           pip install $last_dist
+      - name: Store repository status
+        id: status-before
+        run: |
+          echo "::set-output name=BEFORE::$(git status --porcelain -b)"
       - name: Conda Run tests
         if: matrix.use-conda == true
         run: |
@@ -66,15 +70,26 @@ jobs:
           # to change the default python
           export PATH="$CONDA/envs/testenv/bin:$PATH"
           if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autosklearn --cov-report=xml'; fi
-          $CONDA/envs/testenv/bin/python3 -m pytest --durations=20 -sv $codecov test
+          $CONDA/envs/testenv/bin/python3 -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
       - name: Run tests
         if: matrix.use-conda == false
         run: |
           export OPENBLAS_NUM_THREADS=1
           export OMP_NUM_THREADS=1
           export MKL_NUM_THREADS=1
           if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autosklearn --cov-report=xml'; fi
-          pytest --durations=20 -sv $codecov test
+          pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
+      - name: Check for files left behind by test
+        if: ${{ always() }}
+        run: |
+          before="${{ steps.status-before.outputs.BEFORE }}"
+          after="$(git status --porcelain -b)"
+          if [[ "$before" != "$after" ]]; then
+            echo "git status from before: $before"
+            echo "git status from after: $after"
+            echo "Not all generated files have been deleted!"
+            exit 1
+          fi
       - name: Upload coverage
         if: matrix.code-cov && always()
         uses: codecov/codecov-action@v1
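
For local debugging, the same guard can be written as a pytest fixture so the failure shows up before CI runs. This is a hedged sketch, not part of the commit; it reuses the workflow's `git status --porcelain -b` snapshot idea, and the fixture name is illustrative:

# Sketch (not in this commit): session-scoped autouse fixture that fails
# the run if the test suite leaves new files in the working directory.
import subprocess

import pytest

def _git_status() -> str:
    return subprocess.run(
        ['git', 'status', '--porcelain', '-b'],
        capture_output=True, text=True, check=True,
    ).stdout

@pytest.fixture(scope='session', autouse=True)
def fail_on_leftover_files():
    before = _git_status()
    yield
    after = _git_status()
    assert before == after, (
        'Not all generated files have been deleted!\n'
        'before:\n{}\nafter:\n{}'.format(before, after)
    )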

.gitignore

Lines changed: 21 additions & 0 deletions

@@ -54,3 +54,24 @@ number_submission
 .pypirc
 dmypy.json
 *.log
+
+# Dask created work space
+dask-worker-space
+
+# Python distribution generated files
+.eggs
+
+# Unit test / coverage reports
+htmlcov/
+cover
+coverage
+htmlcov
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+prof/

autosklearn/automl.py

Lines changed: 5 additions & 7 deletions

@@ -14,6 +14,7 @@
 import tempfile

 from ConfigSpace.read_and_write import json as cs_json
+import dask
 import dask.distributed
 import numpy as np
 import numpy.ma as ma
@@ -230,10 +231,11 @@ def __init__(self,

     def _create_dask_client(self):
         self._is_dask_client_internally_created = True
+        dask.config.set({'distributed.worker.daemon': False})
         self._dask_client = dask.distributed.Client(
             dask.distributed.LocalCluster(
                 n_workers=self._n_jobs,
-                processes=False,
+                processes=True,
                 threads_per_worker=1,
                 # We use the temporal directory to save the
                 # dask workers, because deleting workers
@@ -269,9 +271,7 @@ def _get_logger(self, name):
         # This is gonna be honored by the server
         # Which is created below
         setup_logger(
-            output_file=os.path.join(
-                self._backend.temporary_directory, '%s.log' % str(logger_name)
-            ),
+            filename='%s.log' % str(logger_name),
             logging_config=self.logging_config,
             output_dir=self._backend.temporary_directory,
         )
@@ -294,9 +294,7 @@ def _get_logger(self, name):
             logname=logger_name,
             event=self.stop_logging_server,
             port=port,
-            output_file=os.path.join(
-                self._backend.temporary_directory, '%s.log' % str(logger_name)
-            ),
+            filename='%s.log' % str(logger_name),
             logging_config=self.logging_config,
             output_dir=self._backend.temporary_directory,
         ),
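
For context, a standalone sketch of the cluster setup this file moves to: process-based dask workers with daemonization disabled, so the workers themselves can spawn children (pynisher forks a subprocess per evaluation, and daemonic processes are not allowed to have children). The two-worker count is illustrative; the diff passes self._n_jobs.

# Minimal sketch, assuming only the public dask.distributed API.
import dask
import dask.distributed

# Daemonic processes cannot fork children, so turn daemonization off
# before any workers are created.
dask.config.set({'distributed.worker.daemon': False})

cluster = dask.distributed.LocalCluster(
    n_workers=2,           # illustrative; automl.py uses self._n_jobs
    processes=True,        # worker processes instead of in-process threads
    threads_per_worker=1,
)
client = dask.distributed.Client(cluster)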

autosklearn/ensemble_builder.py

Lines changed: 15 additions & 3 deletions

@@ -454,7 +454,11 @@ def __init__(

         # Setup the logger
         self.logger_port = logger_port
-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )

         if ensemble_nbest == 1:
             self.logger.debug("Behaviour depends on int/float: %s, %s (ensemble_nbest, type)" %
@@ -556,7 +560,11 @@ def run(
         elif time_left is not None and end_at is not None:
             raise ValueError('Cannot provide both time_left and end_at.')

-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )

         process_start_time = time.time()
         while True:
@@ -627,7 +635,11 @@ def main(self, time_left, iteration, return_predictions):
         # Pynisher jobs inside dask 'forget'
         # the logger configuration. So we have to set it up
         # accordingly
-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )

         self.start_time = time.time()
         train_pred, valid_pred, test_pred = None, None, None
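
The repeated re-creation of self.logger in run() and main() exists because those methods execute inside pynisher/dask subprocesses, which do not inherit the parent's logging configuration. What get_named_client_logger builds is, in essence, a logger with a TCP socket handler pointed at the central log server. A minimal stdlib sketch of that client side (function name illustrative, not auto-sklearn's API):

# Stdlib sketch of a 'named client logger': records are pickled and
# shipped to a TCP log server instead of being written to a local file,
# so logs from every subprocess land in one place.
import logging
import logging.handlers

def make_client_logger(name: str,
                       host: str = 'localhost',
                       port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
                       ) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.handlers.SocketHandler(host, port))
    return logger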

autosklearn/util/backend.py

Lines changed: 5 additions & 1 deletion

@@ -81,8 +81,12 @@ def __init__(self,
             )
         )
         self._output_directory = output_directory
-        self._logger = logging.get_logger(__name__)
         self.create_directories()
+        # This is the first place the logger gets created.
+        # We want to make sure any logging forward sets the correct directory
+        # where all files should be created
+        logging.setup_logger(output_dir=self._temporary_directory)
+        self._logger = logging.get_logger(__name__)

     @property
     def output_directory(self) -> Optional[str]:
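
The reordering matters because logging.config.dictConfig disables already-existing loggers by default (disable_existing_loggers=True), so a logger fetched before configuration can end up silently muted or writing to the wrong location. A minimal stdlib illustration, independent of auto-sklearn's wrappers; the directory path is a stand-in:

# Configure handlers *before* fetching module loggers, so records land
# inside the intended (temporary) directory.
import logging
import logging.config
import os

temporary_directory = '/tmp/autosklearn_tmp'   # stand-in for the backend temp dir
os.makedirs(temporary_directory, exist_ok=True)

logging.config.dictConfig({
    'version': 1,
    'handlers': {
        'file_handler': {
            'class': 'logging.FileHandler',
            'filename': os.path.join(temporary_directory, 'autosklearn.log'),
        },
    },
    'root': {'handlers': ['file_handler'], 'level': 'DEBUG'},
})

logger = logging.getLogger(__name__)   # safe: configuration is already in place
logger.info('log records now land inside the temporary directory')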

autosklearn/util/hash.py

Lines changed: 3 additions & 0 deletions

@@ -8,6 +8,9 @@
 def hash_array_or_matrix(X: np.ndarray) -> str:
     m = hashlib.md5()

+    if hasattr(X, "iloc"):
+        X = X.to_numpy()
+
     if scipy.sparse.issparse(X):
         m.update(X.indices)
         m.update(X.indptr)
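
The hasattr(X, "iloc") test duck-types pandas objects without importing pandas; converting to a numpy array first means a DataFrame and the equivalent ndarray hash identically. A hedged usage sketch with a trimmed stand-in for the real function body:

# Sketch: the dense branch below is a simplification of hash_array_or_matrix.
import hashlib

import numpy as np
import pandas as pd

def hash_array_or_matrix(X) -> str:
    m = hashlib.md5()
    if hasattr(X, 'iloc'):                   # pandas DataFrame or Series
        X = X.to_numpy()
    m.update(np.ascontiguousarray(X).data)   # stand-in for the dense branch
    return m.hexdigest()

arr = np.array([[1.0, 2.0], [3.0, 4.0]])
assert hash_array_or_matrix(arr) == hash_array_or_matrix(pd.DataFrame(arr))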

autosklearn/util/logging_.py

Lines changed: 53 additions & 26 deletions

@@ -16,30 +16,29 @@


 def setup_logger(
-    output_file: Optional[str] = None,
+    output_dir: str,
+    filename: Optional[str] = None,
+    distributedlog_filename: Optional[str] = None,
     logging_config: Optional[Dict] = None,
-    output_dir: Optional[str] = None,
 ) -> None:
     # logging_config must be a dictionary object specifying the configuration
     # for the loggers to be used in auto-sklearn.
-    if logging_config is not None:
-        if output_file is not None:
-            logging_config['handlers']['file_handler']['filename'] = output_file
-        if output_dir is not None:
-            logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
-                output_dir, 'distributed.log'
-            )
-        logging.config.dictConfig(logging_config)
-    else:
+    if logging_config is None:
         with open(os.path.join(os.path.dirname(__file__), 'logging.yaml'), 'r') as fh:
             logging_config = yaml.safe_load(fh)
-        if output_file is not None:
-            logging_config['handlers']['file_handler']['filename'] = output_file
-        if output_dir is not None:
-            logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
-                output_dir, 'distributed.log'
-            )
-        logging.config.dictConfig(logging_config)
+
+    if filename is None:
+        filename = logging_config['handlers']['file_handler']['filename']
+    logging_config['handlers']['file_handler']['filename'] = os.path.join(
+        output_dir, filename
+    )
+
+    if distributedlog_filename is None:
+        distributedlog_filename = logging_config['handlers']['distributed_logfile']['filename']
+    logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
+        output_dir, distributedlog_filename
+    )
+    logging.config.dictConfig(logging_config)


 def _create_logger(name: str) -> logging.Logger:
@@ -107,15 +106,22 @@ def isEnabledFor(self, level: int) -> bool:


 def get_named_client_logger(
+    output_dir: str,
     name: str,
     host: str = 'localhost',
     port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
 ) -> 'PicklableClientLogger':
-    logger = PicklableClientLogger(name, host, port)
+    logger = PicklableClientLogger(
+        output_dir=output_dir,
+        name=name,
+        host=host,
+        port=port
+    )
     return logger


 def _get_named_client_logger(
+    output_dir: str,
     name: str,
     host: str = 'localhost',
     port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
@@ -133,6 +139,8 @@ def _get_named_client_logger(

     Parameters
     ----------
+    output_dir: (str)
+        The path where the log files are going to be dumped
     name: (str)
         the name of the logger, used to tag the messages in the main log
     host: (str)
@@ -143,7 +151,7 @@ def _get_named_client_logger(
     local_logger: a logger object that has a socket handler
     """
     # Setup the logger configuration
-    setup_logger()
+    setup_logger(output_dir=output_dir)

     local_logger = _create_logger(name)

@@ -159,11 +167,17 @@ def _get_named_client_logger(

 class PicklableClientLogger(PickableLoggerAdapter):

-    def __init__(self, name: str, host: str, port: int):
+    def __init__(self, output_dir: str, name: str, host: str, port: int):
+        self.output_dir = output_dir
         self.name = name
         self.host = host
         self.port = port
-        self.logger = _get_named_client_logger(name, host, port)
+        self.logger = _get_named_client_logger(
+            output_dir=output_dir,
+            name=name,
+            host=host,
+            port=port
+        )

     def __getstate__(self) -> Dict[str, Any]:
         """
@@ -174,7 +188,12 @@ def __getstate__(self) -> Dict[str, Any]:
         Dictionary, representing the object state to be pickled. Ignores
         the self.logger field and only returns the logger name.
         """
-        return {'name': self.name, 'host': self.host, 'port': self.port}
+        return {
+            'name': self.name,
+            'host': self.host,
+            'port': self.port,
+            'output_dir': self.output_dir,
+        }

     def __setstate__(self, state: Dict[str, Any]) -> None:
         """
@@ -189,7 +208,13 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
         self.name = state['name']
         self.host = state['host']
         self.port = state['port']
-        self.logger = _get_named_client_logger(self.name, self.host, self.port)
+        self.output_dir = state['output_dir']
+        self.logger = _get_named_client_logger(
+            name=self.name,
+            host=self.host,
+            port=self.port,
+            output_dir=self.output_dir,
+        )


 class LogRecordStreamHandler(socketserver.StreamRequestHandler):
@@ -242,11 +267,13 @@ def start_log_server(
     logname: str,
     event: threading.Event,
     port: multiprocessing.Value,
-    output_file: str,
+    filename: str,
     logging_config: Dict,
     output_dir: str,
 ) -> None:
-    setup_logger(output_file, logging_config, output_dir)
+    setup_logger(filename=filename,
+                 logging_config=logging_config,
+                 output_dir=output_dir)

     while True:
         # Loop until we find a valid port
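
output_dir has to travel through __getstate__/__setstate__ because dask ships tasks to workers by pickling them: any field omitted from the pickled state is lost, and the rebuilt logger would not know where to write. A hedged sketch of the pattern with a simplified stand-in class, not the real PicklableClientLogger:

# Sketch: keep a non-picklable resource out of the pickled state and
# rebuild it from plain fields on unpickle.
import pickle

class PicklableClient:
    def __init__(self, output_dir: str, name: str, host: str, port: int):
        self.output_dir = output_dir
        self.name = name
        self.host = host
        self.port = port
        self.logger = self._connect()        # non-picklable handle

    def _connect(self):
        # stand-in for _get_named_client_logger(output_dir, name, host, port)
        return object()

    def __getstate__(self):
        # every field _connect needs must be listed here; leaving out
        # output_dir is exactly the misconfiguration this commit fixes
        return {'name': self.name, 'host': self.host,
                'port': self.port, 'output_dir': self.output_dir}

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.logger = self._connect()

client = PicklableClient('/tmp/logs', 'EnsembleBuilder', 'localhost', 9020)
clone = pickle.loads(pickle.dumps(client))
assert clone.output_dir == client.output_dir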

test/conftest.py

Lines changed: 4 additions & 1 deletion

@@ -3,6 +3,7 @@
 import time
 import unittest.mock

+import dask
 from dask.distributed import Client, get_client
 import psutil
 import pytest
@@ -124,7 +125,8 @@ def dask_client(request):
     Workers are in subprocesses to not create deadlocks with the pynisher and logging.
     """

-    client = Client(n_workers=2, threads_per_worker=1, processes=False)
+    dask.config.set({'distributed.worker.daemon': False})
+    client = Client(n_workers=2, threads_per_worker=1, processes=True)
     print("Started Dask client={}\n".format(client))

     def get_finalizer(address):
@@ -149,6 +151,7 @@ def dask_client_single_worker(request):
     it is used very rarely to avoid this issue as much as possible.
     """

+    dask.config.set({'distributed.worker.daemon': False})
     client = Client(n_workers=1, threads_per_worker=1, processes=False)
     print("Started Dask client={}\n".format(client))
