
Commit 34d54d9

Fix736 (#861)

* Configure StreamHandler and RotatingFileHandler for openml logs. Make sure the openml logger is used instead of the root logger.
* Update logger for examples.
* mypy/flake8 updates
* Configure logging after creating the cache directory (as the file log requires the directory to exist).
* Create cache directory (including the cache subdirectory).
* Create .openml and .openml/cache separately.
* Translate OpenML logging levels to Python.
* Log->Print in examples. Fix log levels. Add PR to changelog.
* Allow programmatic change of log level, add example.
* Add docstring to example file.

1 parent 882b06b commit 34d54d9

File tree

8 files changed: +127 -41 lines


doc/progress.rst

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ Changelog
 * ADD #783: The URL to download the predictions for a run is now stored in the run object.
 * ADD #790: Adds the uploader name and id as new filtering options for ``list_evaluations``.
 * ADD #792: New convenience function ``openml.flow.get_flow_id``.
+* ADD #861: Debug-level log information is now written to a file in the cache directory (at most 2 MB).
 * DOC #778: Introduces instructions on how to publish an extension to support other libraries
   than scikit-learn.
 * DOC #785: The examples section is completely restructured into simple examples, advanced
@@ -34,6 +35,7 @@ Changelog
 * DOC #834: New example showing how to plot the loss surface for a support vector machine.
 * FIX #305: Do not require the external version in the flow XML when loading an object.
 * FIX #734: Better handling of *"old"* flows.
+* FIX #736: Attach a StreamHandler to the openml logger instead of the root logger.
 * FIX #758: Fixes an error which made the client API crash when loading sparse data with
   categorical variables.
 * FIX #779: Do not fail on corrupt pickle
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+"""
+========
+Logging
+========
+
+Explains openml-python logging, and shows how to configure it.
+"""
+##################################################################################
+# Logging
+# ^^^^^^^
+# Openml-python uses the `Python logging module <https://docs.python.org/3/library/logging.html>`_
+# to provide users with log messages. Each log message is assigned a level of importance, see
+# the table in Python's logging tutorial
+# `here <https://docs.python.org/3/howto/logging.html#when-to-use-logging>`_.
+#
+# By default, openml-python will print log messages of level `WARNING` and above to console.
+# All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be
+# found in your cache directory (see also the
+# `introduction tutorial <../20_basic/introduction_tutorial.html>`_).
+# These file logs are automatically deleted if needed, and use at most 2MB of space.
+#
+# It is possible to configure which log levels are sent to console and file.
+# When downloading a dataset from OpenML, a `DEBUG`-level message is written:
+
+import openml
+openml.datasets.get_dataset('iris')
+
+# With the default configuration, the above example will show no output to console.
+# However, in your cache directory you should find a file named 'openml_python.log',
+# which has a DEBUG message written to it. It should be either like
+# "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..."
+# or like
+# "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date.",
+# depending on whether or not you had downloaded iris before.
+# Which log levels are processed can be configured programmatically:
+
+import logging
+openml.config.console_log.setLevel(logging.DEBUG)
+openml.config.file_log.setLevel(logging.WARNING)
+openml.datasets.get_dataset('iris')
+
+# Now the message that was previously only written to file is also shown in the console.
+# It is no longer written to file, as the `file_log` was set to level `WARNING`.
+#
+# It is also possible to specify the desired log levels through the configuration file.
+# This way you will not need to set them in each script separately.
+# Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file,
+# where 'NUMBER' should be one of:
+#
+# * 0: `logging.WARNING` and up.
+# * 1: `logging.INFO` and up.
+# * 2: `logging.DEBUG` and up (i.e. all messages).
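For reference, the configuration-file route described above could look like the sketch below. The two keys come from the `_defaults` dict in openml/config.py (shown later in this diff); the exact config file location depends on your installation, so treat the path in the comment as an assumption.

# Hypothetical excerpt of the openml-python config file (e.g. in the ~/.openml directory):
# print INFO and up to console, keep DEBUG and up in the rotating file log.
verbosity = 1
file_verbosity = 2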

examples/30_extended/run_setup_tutorial.py

Lines changed: 0 additions & 3 deletions
@@ -29,16 +29,13 @@
 connects to the test server at test.openml.org. This prevents the main
 server from crowding with example datasets, tasks, runs, and so on.
 """
-import logging
 import numpy as np
 import openml
 import sklearn.ensemble
 import sklearn.impute
 import sklearn.preprocessing


-root = logging.getLogger()
-root.setLevel(logging.INFO)
 openml.config.start_using_configuration_for_example()

 ###############################################################################
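With the root-logger configuration removed from the examples, a script that wants more verbose output can instead adjust the handlers this PR attaches to the openml logger. A minimal sketch using the `console_log` handle introduced in openml/config.py (shown later in this diff):

import logging
import openml

# Raise console verbosity to INFO without touching the root logger,
# so logging behaviour of other libraries is left unchanged.
openml.config.console_log.setLevel(logging.INFO)
openml.config.start_using_configuration_for_example()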

examples/40_paper/2018_kdd_rijn_example.py

Lines changed: 7 additions & 9 deletions
@@ -15,22 +15,20 @@
 | In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
 | Available at https://dl.acm.org/citation.cfm?id=3220058
 """
-import json
-import logging
 import sys

 if sys.platform == 'win32':  # noqa
-    logging.warning('The pyrfr library (requirement of fanova) can currently not be installed on Windows systems')
+    print('The pyrfr library (requirement of fanova) can currently not be installed on Windows systems')
     exit()
+
+import json
 import fanova
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns

 import openml

-root = logging.getLogger()
-root.setLevel(logging.INFO)

 ##############################################################################
 # With the advent of automated machine learning, automated hyperparameter
@@ -80,8 +78,8 @@
 for idx, task_id in enumerate(suite.tasks):
     if limit_nr_tasks is not None and idx >= limit_nr_tasks:
         continue
-    logging.info('Starting with task %d (%d/%d)' % (task_id, idx+1,
-                 len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks))
+    print('Starting with task %d (%d/%d)'
+          % (task_id, idx+1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks))
     # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
     evals = openml.evaluations.list_evaluations_setups(
         evaluation_measure, flow=[flow_id], task=[task_id], size=limit_per_task, output_format='dataframe')
@@ -98,7 +96,7 @@
                 **{performance_column: setup[performance_column]})
             for _, setup in evals.iterrows()])
     except json.decoder.JSONDecodeError as e:
-        logging.warning('Task %d error: %s' % (task_id, e))
+        print('Task %d error: %s' % (task_id, e))
         continue
     # apply our filters, to have only the setups that comply to the hyperparameters we want
     for filter_key, filter_value in parameter_filters.items():
@@ -127,7 +125,7 @@
         # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
         # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
        # paper).
-        logging.warning('Task %d error: %s' % (task_id, e))
+        print('Task %d error: %s' % (task_id, e))
        continue

 # transform ``fanova_results`` from a list of dicts into a DataFrame

openml/config.py

Lines changed: 49 additions & 10 deletions
@@ -2,23 +2,49 @@
 Store module level information like the API key, cache directory and the server
 """
 import logging
+import logging.handlers
 import os
+from typing import cast

 from io import StringIO
 import configparser
 from urllib.parse import urlparse

-
 logger = logging.getLogger(__name__)
-logging.basicConfig(
-    format='[%(levelname)s] [%(asctime)s:%(name)s] %('
-           'message)s', datefmt='%H:%M:%S')

-# Default values!
+
+def configure_logging(console_output_level: int, file_output_level: int):
+    """ Sets the OpenML logger to DEBUG, with attached Stream- and FileHandler. """
+    # Verbosity levels as defined (https://github.com/openml/OpenML/wiki/Client-API-Standards)
+    # don't match Python values directly:
+    verbosity_map = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
+
+    openml_logger = logging.getLogger('openml')
+    openml_logger.setLevel(logging.DEBUG)
+    message_format = '[%(levelname)s] [%(asctime)s:%(name)s] %(message)s'
+    output_formatter = logging.Formatter(message_format, datefmt='%H:%M:%S')
+
+    console_stream = logging.StreamHandler()
+    console_stream.setFormatter(output_formatter)
+    console_stream.setLevel(verbosity_map[console_output_level])
+
+    one_mb = 2**20
+    log_path = os.path.join(cache_directory, 'openml_python.log')
+    file_stream = logging.handlers.RotatingFileHandler(log_path, maxBytes=one_mb, backupCount=1)
+    file_stream.setLevel(verbosity_map[file_output_level])
+    file_stream.setFormatter(output_formatter)
+
+    openml_logger.addHandler(console_stream)
+    openml_logger.addHandler(file_stream)
+    return console_stream, file_stream
+
+
+# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards)
 _defaults = {
     'apikey': None,
     'server': "https://www.openml.org/api/v1/xml",
-    'verbosity': 0,
+    'verbosity': 0,  # WARNING
+    'file_verbosity': 2,  # DEBUG
     'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')),
     'avoid_duplicate_runs': 'True',
     'connection_n_retries': 2,
@@ -32,7 +58,7 @@
 server_base_url = server[:-len('/api/v1/xml')]
 apikey = _defaults['apikey']
 # The current cache directory (without the server name)
-cache_directory = _defaults['cachedir']
+cache_directory = str(_defaults['cachedir'])  # so mypy knows it is a string
 avoid_duplicate_runs = True if _defaults['avoid_duplicate_runs'] == 'True' else False

 # Number of retries if the connection breaks
@@ -101,19 +127,28 @@ def _setup():
     global cache_directory
     global avoid_duplicate_runs
     global connection_n_retries
+
     # read config file, create cache directory
     try:
         os.mkdir(os.path.expanduser(os.path.join('~', '.openml')))
-    except (IOError, OSError):
-        # TODO add debug information
+    except FileExistsError:
+        # For other errors, we want to propagate the error as openml does not work without cache
        pass
+
     config = _parse_config()
     apikey = config.get('FAKE_SECTION', 'apikey')
     server = config.get('FAKE_SECTION', 'server')

     short_cache_dir = config.get('FAKE_SECTION', 'cachedir')
     cache_directory = os.path.expanduser(short_cache_dir)

+    # create the cache subdirectory
+    try:
+        os.mkdir(cache_directory)
+    except FileExistsError:
+        # For other errors, we want to propagate the error as openml does not work without cache
+        pass
+
     avoid_duplicate_runs = config.getboolean('FAKE_SECTION',
                                              'avoid_duplicate_runs')
     connection_n_retries = config.get('FAKE_SECTION', 'connection_n_retries')
@@ -147,7 +182,7 @@ def _parse_config():
         config_file_.seek(0)
         config.read_file(config_file_)
     except OSError as e:
-        logging.info("Error opening file %s: %s", config_file, e.message)
+        logger.info("Error opening file %s: %s", config_file, e.message)
     return config


@@ -204,3 +239,7 @@ def set_cache_directory(cachedir):
 ]

 _setup()
+
+_console_log_level = cast(int, _defaults['verbosity'])
+_file_log_level = cast(int, _defaults['file_verbosity'])
+console_log, file_log = configure_logging(_console_log_level, _file_log_level)
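The "at most 2 MB" figure in the changelog follows from these handler arguments: with maxBytes=2**20 and backupCount=1, the RotatingFileHandler keeps the active log plus one rolled-over backup, each capped at 1 MB. A standalone sketch of the same rotation policy (the 'demo' logger name and demo.log path are illustrative, not part of openml):

import logging
import logging.handlers

demo_logger = logging.getLogger('demo')  # hypothetical logger for illustration
demo_logger.setLevel(logging.DEBUG)
handler = logging.handlers.RotatingFileHandler(
    'demo.log', maxBytes=2**20, backupCount=1)  # 1 MB active file + 1 backup
demo_logger.addHandler(handler)

for i in range(200000):
    demo_logger.debug('message %d', i)  # disk usage stays capped near 2 MB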

openml/datasets/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -441,7 +441,7 @@ def _load_data(self):
             with open(self.data_pickle_file, "rb") as fh:
                 data, categorical, attribute_names = pickle.load(fh)
         except EOFError:
-            logging.warning(
+            logger.warning(
                 "Detected a corrupt cache file loading dataset %d: '%s'. "
                 "We will continue loading data from the arff-file, "
                 "but this will be much slower for big datasets. "
@@ -512,7 +512,7 @@ def _encode_if_category(column):
             return data
         else:
             data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
-            logging.warning(
+            logger.warning(
                 "Cannot convert %s (%s) to '%s'. Returning input data."
                 % (data_type, type(data), array_format)
             )
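The small change from `logging.warning` to `logger.warning` is the substance of FIX #736: the module-level `logging.warning(...)` call goes through the root logger (and configures it implicitly if it has no handlers), whereas a named logger such as `openml.datasets.dataset` propagates its records up to the `openml` logger configured in openml/config.py above. A minimal standalone sketch of the distinction (not openml code):

import logging

# Root logger: logging.warning() implicitly calls basicConfig() when the
# root logger has no handlers, which is what FIX #736 avoids.
logging.warning('handled by the root logger')

# Named logger: records propagate to the 'openml' parent, so the Stream-
# and RotatingFileHandler attached by configure_logging() apply.
logger = logging.getLogger('openml.datasets.dataset')
logger.warning('handled by the handlers on the openml logger')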

openml/extensions/sklearn/extension.py

Lines changed: 13 additions & 15 deletions
@@ -34,6 +34,8 @@
     OpenMLRegressionTask,
 )

+logger = logging.getLogger(__name__)
+

 if sys.version_info >= (3, 5):
     from json.decoder import JSONDecodeError
@@ -271,9 +273,8 @@ def _deserialize_sklearn(
             mixed
         """

-        logging.info('-%s flow_to_sklearn START o=%s, components=%s, '
-                     'init_defaults=%s' % ('-' * recursion_depth, o, components,
-                                           initialize_with_defaults))
+        logger.info('-%s flow_to_sklearn START o=%s, components=%s, init_defaults=%s'
+                    % ('-' * recursion_depth, o, components, initialize_with_defaults))
         depth_pp = recursion_depth + 1  # shortcut var, depth plus plus

         # First, we need to check whether the presented object is a json string.
@@ -376,8 +377,7 @@ def _deserialize_sklearn(
             )
         else:
             raise TypeError(o)
-        logging.info('-%s flow_to_sklearn END o=%s, rval=%s'
-                     % ('-' * recursion_depth, o, rval))
+        logger.info('-%s flow_to_sklearn END o=%s, rval=%s' % ('-' * recursion_depth, o, rval))
         return rval

     def model_to_flow(self, model: Any) -> 'OpenMLFlow':
@@ -537,16 +537,16 @@ def match_format(s):
                     s = "{}...".format(s[:char_lim - 3])
                 return s.strip()
             except ValueError:
-                logging.warning("'Read more' not found in descriptions. "
-                                "Trying to trim till 'Parameters' if available in docstring.")
+                logger.warning("'Read more' not found in descriptions. "
+                               "Trying to trim till 'Parameters' if available in docstring.")
                 pass
             try:
                 # if 'Read more' doesn't exist, trim till 'Parameters'
                 pattern = "Parameters"
                 index = s.index(match_format(pattern))
             except ValueError:
                 # returning full docstring
-                logging.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
+                logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
                 index = len(s)
             s = s[:index]
             # trimming docstring to be within char_lim
@@ -580,7 +580,7 @@ def match_format(s):
             index1 = s.index(match_format("Parameters"))
         except ValueError as e:
             # when sklearn docstring has no 'Parameters' section
-            logging.warning("{} {}".format(match_format("Parameters"), e))
+            logger.warning("{} {}".format(match_format("Parameters"), e))
             return None

         headings = ["Attributes", "Notes", "See also", "Note", "References"]
@@ -590,7 +590,7 @@ def match_format(s):
                 index2 = s.index(match_format(h))
                 break
             except ValueError:
-                logging.warning("{} not available in docstring".format(h))
+                logger.warning("{} not available in docstring".format(h))
                 continue
         else:
             # in the case only 'Parameters' exist, trim till end of docstring
@@ -975,7 +975,7 @@ def _deserialize_model(
         recursion_depth: int,
         strict_version: bool = True
     ) -> Any:
-        logging.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
+        logger.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
         model_name = flow.class_name
         self._check_dependencies(flow.dependencies,
                                  strict_version=strict_version)
@@ -993,8 +993,7 @@ def _deserialize_model(

         for name in parameters:
             value = parameters.get(name)
-            logging.info('--%s flow_parameter=%s, value=%s' %
-                         ('-' * recursion_depth, name, value))
+            logger.info('--%s flow_parameter=%s, value=%s' % ('-' * recursion_depth, name, value))
             rval = self._deserialize_sklearn(
                 value,
                 components=components_,
@@ -1010,8 +1009,7 @@ def _deserialize_model(
             if name not in components_:
                 continue
             value = components[name]
-            logging.info('--%s flow_component=%s, value=%s'
-                         % ('-' * recursion_depth, name, value))
+            logger.info('--%s flow_component=%s, value=%s' % ('-' * recursion_depth, name, value))
             rval = self._deserialize_sklearn(
                 value,
                 recursion_depth=recursion_depth + 1,

tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@
 # finding the root directory of conftest.py and going up to OpenML main directory
 # exploiting the fact that conftest.py always resides in the root directory for tests
 static_dir = os.path.dirname(os.path.abspath(__file__))
-logging.info("static directory: {}".format(static_dir))
+logger.info("static directory: {}".format(static_dir))
 print("static directory: {}".format(static_dir))
 while True:
     if 'openml' in os.listdir(static_dir):
@@ -178,4 +178,4 @@ def pytest_sessionfinish() -> None:
     compare_delete_files(file_list, new_file_list)
     logger.info("Local files deleted")

-    logging.info("{} is killed".format(worker))
+    logger.info("{} is killed".format(worker))
