
Commit 34d54d9

Fix736 (#861)

* Configure StreamHandler and RotatingFileHandler for openml logs. Make sure the openml logger is used instead of the root logger.
* Update logger for examples.
* mypy/flake8 updates
* Configure logging after creating the cache directory (as the file log requires the directory to exist).
* Create cache directory (including the cache subdirectory).
* Create .openml and .openml/cache separately.
* Translate OpenML logging levels to Python.
* Log->Print in examples. Fix log levels. Add PR to changelog.
* Allow programmatic change of log level, add example.
* Add docstring to example file.

1 parent 882b06b commit 34d54d9

File tree

8 files changed: +127 -41 lines


doc/progress.rst

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ Changelog
 * ADD #783: The URL to download the predictions for a run is now stored in the run object.
 * ADD #790: Adds the uploader name and id as new filtering options for ``list_evaluations``.
 * ADD #792: New convenience function ``openml.flow.get_flow_id``.
+* ADD #861: Debug-level log information is now written to a file in the cache directory (at most 2 MB).
 * DOC #778: Introduces instructions on how to publish an extension to support other libraries
   than scikit-learn.
 * DOC #785: The examples section is completely restructured into simple examples, advanced
@@ -34,6 +35,7 @@ Changelog
 * DOC #834: New example showing how to plot the loss surface for a support vector machine.
 * FIX #305: Do not require the external version in the flow XML when loading an object.
 * FIX #734: Better handling of *"old"* flows.
+* FIX #736: Attach a StreamHandler to the openml logger instead of the root logger.
 * FIX #758: Fixes an error which made the client API crash when loading sparse data with
   categorical variables.
 * FIX #779: Do not fail on corrupt pickle
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+"""
+========
+Logging
+========
+
+Explains openml-python logging, and shows how to configure it.
+"""
+##################################################################################
+# Logging
+# ^^^^^^^
+# Openml-python uses the `Python logging module <https://docs.python.org/3/library/logging.html>`_
+# to provide users with log messages. Each log message is assigned a level of importance, see
+# the table in Python's logging tutorial
+# `here <https://docs.python.org/3/howto/logging.html#when-to-use-logging>`_.
+#
+# By default, openml-python will print log messages of level `WARNING` and above to console.
+# All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be
+# found in your cache directory (see also the
+# `introduction tutorial <../20_basic/introduction_tutorial.html>`_).
+# These file logs are automatically deleted if needed, and use at most 2MB of space.
+#
+# It is possible to configure which log levels are sent to console and file.
+# When downloading a dataset from OpenML, a `DEBUG`-level message is written:
+
+import openml
+openml.datasets.get_dataset('iris')
+
+# With the default configuration, the above example will show no output to console.
+# However, in your cache directory you should find a file named 'openml_python.log',
+# which has a DEBUG message written to it. It should be either like
+# "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..."
+# or like
+# "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date.",
+# depending on whether or not you had downloaded iris before.
+# Which log levels are processed can be configured programmatically:
+
+import logging
+openml.config.console_log.setLevel(logging.DEBUG)
+openml.config.file_log.setLevel(logging.WARNING)
+openml.datasets.get_dataset('iris')
+
+# Now the message that was previously only written to file is also shown in the console.
+# It is no longer written to file, as the `file_log` was set to level `WARNING`.
+#
+# It is also possible to specify the desired log levels through the configuration file.
+# This way you will not need to set them in each script separately.
+# Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file,
+# where 'NUMBER' should be one of:
+#
+# * 0: `logging.WARNING` and up.
+# * 1: `logging.INFO` and up.
+# * 2: `logging.DEBUG` and up (i.e. all messages).
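For reference, the configuration-file route described above could look like the sketch below. The two keys come from the `_defaults` dict in openml/config.py (shown later in this diff); the exact config file location depends on your installation, so treat the path in the comment as an assumption.

# Hypothetical excerpt of the openml-python config file (e.g. in the ~/.openml directory):
# print INFO and up to console, keep DEBUG and up in the rotating file log.
verbosity = 1
file_verbosity = 2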

examples/30_extended/run_setup_tutorial.py

Lines changed: 0 additions & 3 deletions
@@ -29,16 +29,13 @@
 connects to the test server at test.openml.org. This prevents the main
 server from crowding with example datasets, tasks, runs, and so on.
 """
-import logging
 import numpy as np
 import openml
 import sklearn.ensemble
 import sklearn.impute
 import sklearn.preprocessing


-root = logging.getLogger()
-root.setLevel(logging.INFO)
 openml.config.start_using_configuration_for_example()

 ###############################################################################
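With the root-logger configuration removed from the examples, a script that wants more verbose output can instead adjust the handlers this PR attaches to the openml logger. A minimal sketch using the `console_log` handle introduced in openml/config.py (shown later in this diff):

import logging
import openml

# Raise console verbosity to INFO without touching the root logger,
# so logging behaviour of other libraries is left unchanged.
openml.config.console_log.setLevel(logging.INFO)
openml.config.start_using_configuration_for_example()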

examples/40_paper/2018_kdd_rijn_example.py

Lines changed: 7 additions & 9 deletions
@@ -15,22 +15,20 @@
 | In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
 | Available at https://dl.acm.org/citation.cfm?id=3220058
 """
-import json
-import logging
 import sys

 if sys.platform == 'win32':  # noqa
-    logging.warning('The pyrfr library (requirement of fanova) can currently not be installed on Windows systems')
+    print('The pyrfr library (requirement of fanova) can currently not be installed on Windows systems')
     exit()
+
+import json
 import fanova
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns

 import openml

-root = logging.getLogger()
-root.setLevel(logging.INFO)

 ##############################################################################
 # With the advent of automated machine learning, automated hyperparameter
@@ -80,8 +78,8 @@
 for idx, task_id in enumerate(suite.tasks):
     if limit_nr_tasks is not None and idx >= limit_nr_tasks:
         continue
-    logging.info('Starting with task %d (%d/%d)' % (task_id, idx+1,
-                 len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks))
+    print('Starting with task %d (%d/%d)'
+          % (task_id, idx+1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks))
     # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
     evals = openml.evaluations.list_evaluations_setups(
         evaluation_measure, flow=[flow_id], task=[task_id], size=limit_per_task, output_format='dataframe')
@@ -98,7 +96,7 @@
                 **{performance_column: setup[performance_column]})
             for _, setup in evals.iterrows()])
     except json.decoder.JSONDecodeError as e:
-        logging.warning('Task %d error: %s' % (task_id, e))
+        print('Task %d error: %s' % (task_id, e))
         continue
     # apply our filters, to have only the setups that comply to the hyperparameters we want
     for filter_key, filter_value in parameter_filters.items():
@@ -127,7 +125,7 @@
         # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
         # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
        # paper).
-        logging.warning('Task %d error: %s' % (task_id, e))
+        print('Task %d error: %s' % (task_id, e))
        continue

 # transform ``fanova_results`` from a list of dicts into a DataFrame

openml/config.py

Lines changed: 49 additions & 10 deletions
@@ -2,23 +2,49 @@
 Store module level information like the API key, cache directory and the server
 """
 import logging
+import logging.handlers
 import os
+from typing import cast

 from io import StringIO
 import configparser
 from urllib.parse import urlparse

-
 logger = logging.getLogger(__name__)
-logging.basicConfig(
-    format='[%(levelname)s] [%(asctime)s:%(name)s] %('
-           'message)s', datefmt='%H:%M:%S')

-# Default values!
+
+def configure_logging(console_output_level: int, file_output_level: int):
+    """ Sets the OpenML logger to DEBUG, with attached Stream- and FileHandler. """
+    # Verbosity levels as defined (https://github.com/openml/OpenML/wiki/Client-API-Standards)
+    # don't match Python values directly:
+    verbosity_map = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
+
+    openml_logger = logging.getLogger('openml')
+    openml_logger.setLevel(logging.DEBUG)
+    message_format = '[%(levelname)s] [%(asctime)s:%(name)s] %(message)s'
+    output_formatter = logging.Formatter(message_format, datefmt='%H:%M:%S')
+
+    console_stream = logging.StreamHandler()
+    console_stream.setFormatter(output_formatter)
+    console_stream.setLevel(verbosity_map[console_output_level])
+
+    one_mb = 2**20
+    log_path = os.path.join(cache_directory, 'openml_python.log')
+    file_stream = logging.handlers.RotatingFileHandler(log_path, maxBytes=one_mb, backupCount=1)
+    file_stream.setLevel(verbosity_map[file_output_level])
+    file_stream.setFormatter(output_formatter)
+
+    openml_logger.addHandler(console_stream)
+    openml_logger.addHandler(file_stream)
+    return console_stream, file_stream
+
+
+# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards)
 _defaults = {
     'apikey': None,
     'server': "https://www.openml.org/api/v1/xml",
-    'verbosity': 0,
+    'verbosity': 0,  # WARNING
+    'file_verbosity': 2,  # DEBUG
     'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')),
     'avoid_duplicate_runs': 'True',
     'connection_n_retries': 2,
@@ -32,7 +58,7 @@
 server_base_url = server[:-len('/api/v1/xml')]
 apikey = _defaults['apikey']
 # The current cache directory (without the server name)
-cache_directory = _defaults['cachedir']
+cache_directory = str(_defaults['cachedir'])  # so mypy knows it is a string
 avoid_duplicate_runs = True if _defaults['avoid_duplicate_runs'] == 'True' else False

 # Number of retries if the connection breaks
@@ -101,19 +127,28 @@ def _setup():
     global cache_directory
     global avoid_duplicate_runs
     global connection_n_retries
+
     # read config file, create cache directory
     try:
         os.mkdir(os.path.expanduser(os.path.join('~', '.openml')))
-    except (IOError, OSError):
-        # TODO add debug information
+    except FileExistsError:
+        # For other errors, we want to propagate the error as openml does not work without cache
        pass
+
     config = _parse_config()
     apikey = config.get('FAKE_SECTION', 'apikey')
     server = config.get('FAKE_SECTION', 'server')

     short_cache_dir = config.get('FAKE_SECTION', 'cachedir')
     cache_directory = os.path.expanduser(short_cache_dir)

+    # create the cache subdirectory
+    try:
+        os.mkdir(cache_directory)
+    except FileExistsError:
+        # For other errors, we want to propagate the error as openml does not work without cache
+        pass
+
     avoid_duplicate_runs = config.getboolean('FAKE_SECTION',
                                              'avoid_duplicate_runs')
     connection_n_retries = config.get('FAKE_SECTION', 'connection_n_retries')
@@ -147,7 +182,7 @@ def _parse_config():
         config_file_.seek(0)
         config.read_file(config_file_)
     except OSError as e:
-        logging.info("Error opening file %s: %s", config_file, e.message)
+        logger.info("Error opening file %s: %s", config_file, e.message)
     return config


@@ -204,3 +239,7 @@ def set_cache_directory(cachedir):
 ]

 _setup()
+
+_console_log_level = cast(int, _defaults['verbosity'])
+_file_log_level = cast(int, _defaults['file_verbosity'])
+console_log, file_log = configure_logging(_console_log_level, _file_log_level)
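The "at most 2 MB" figure in the changelog follows from these handler arguments: with maxBytes=2**20 and backupCount=1, the RotatingFileHandler keeps the active log plus one rolled-over backup, each capped at 1 MB. A standalone sketch of the same rotation policy (the 'demo' logger name and demo.log path are illustrative, not part of openml):

import logging
import logging.handlers

demo_logger = logging.getLogger('demo')  # hypothetical logger for illustration
demo_logger.setLevel(logging.DEBUG)
handler = logging.handlers.RotatingFileHandler(
    'demo.log', maxBytes=2**20, backupCount=1)  # 1 MB active file + 1 backup
demo_logger.addHandler(handler)

for i in range(200000):
    demo_logger.debug('message %d', i)  # disk usage stays capped near 2 MB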

openml/datasets/dataset.py

Lines changed: 2 additions & 2 deletions
@@ -441,7 +441,7 @@ def _load_data(self):
             with open(self.data_pickle_file, "rb") as fh:
                 data, categorical, attribute_names = pickle.load(fh)
         except EOFError:
-            logging.warning(
+            logger.warning(
                 "Detected a corrupt cache file loading dataset %d: '%s'. "
                 "We will continue loading data from the arff-file, "
                 "but this will be much slower for big datasets. "
@@ -512,7 +512,7 @@ def _encode_if_category(column):
             return data
         else:
             data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
-            logging.warning(
+            logger.warning(
                 "Cannot convert %s (%s) to '%s'. Returning input data."
                 % (data_type, type(data), array_format)
             )
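The small change from `logging.warning` to `logger.warning` is the substance of FIX #736: the module-level `logging.warning(...)` call goes through the root logger (and configures it implicitly if it has no handlers), whereas a named logger such as `openml.datasets.dataset` propagates its records up to the `openml` logger configured in openml/config.py above. A minimal standalone sketch of the distinction (not openml code):

import logging

# Root logger: logging.warning() implicitly calls basicConfig() when the
# root logger has no handlers, which is what FIX #736 avoids.
logging.warning('handled by the root logger')

# Named logger: records propagate to the 'openml' parent, so the Stream-
# and RotatingFileHandler attached by configure_logging() apply.
logger = logging.getLogger('openml.datasets.dataset')
logger.warning('handled by the handlers on the openml logger')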

openml/extensions/sklearn/extension.py

Lines changed: 13 additions & 15 deletions
@@ -34,6 +34,8 @@
     OpenMLRegressionTask,
 )

+logger = logging.getLogger(__name__)
+

 if sys.version_info >= (3, 5):
     from json.decoder import JSONDecodeError
@@ -271,9 +273,8 @@ def _deserialize_sklearn(
             mixed
         """

-        logging.info('-%s flow_to_sklearn START o=%s, components=%s, '
-                     'init_defaults=%s' % ('-' * recursion_depth, o, components,
-                                           initialize_with_defaults))
+        logger.info('-%s flow_to_sklearn START o=%s, components=%s, init_defaults=%s'
+                    % ('-' * recursion_depth, o, components, initialize_with_defaults))
         depth_pp = recursion_depth + 1  # shortcut var, depth plus plus

         # First, we need to check whether the presented object is a json string.
@@ -376,8 +377,7 @@ def _deserialize_sklearn(
             )
         else:
             raise TypeError(o)
-        logging.info('-%s flow_to_sklearn END o=%s, rval=%s'
-                     % ('-' * recursion_depth, o, rval))
+        logger.info('-%s flow_to_sklearn END o=%s, rval=%s' % ('-' * recursion_depth, o, rval))
         return rval

     def model_to_flow(self, model: Any) -> 'OpenMLFlow':
@@ -537,16 +537,16 @@ def match_format(s):
                     s = "{}...".format(s[:char_lim - 3])
                 return s.strip()
             except ValueError:
-                logging.warning("'Read more' not found in descriptions. "
-                                "Trying to trim till 'Parameters' if available in docstring.")
+                logger.warning("'Read more' not found in descriptions. "
+                               "Trying to trim till 'Parameters' if available in docstring.")
                 pass
             try:
                 # if 'Read more' doesn't exist, trim till 'Parameters'
                 pattern = "Parameters"
                 index = s.index(match_format(pattern))
             except ValueError:
                 # returning full docstring
-                logging.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
+                logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
                 index = len(s)
             s = s[:index]
             # trimming docstring to be within char_lim
@@ -580,7 +580,7 @@ def match_format(s):
             index1 = s.index(match_format("Parameters"))
         except ValueError as e:
             # when sklearn docstring has no 'Parameters' section
-            logging.warning("{} {}".format(match_format("Parameters"), e))
+            logger.warning("{} {}".format(match_format("Parameters"), e))
             return None

         headings = ["Attributes", "Notes", "See also", "Note", "References"]
@@ -590,7 +590,7 @@ def match_format(s):
                 index2 = s.index(match_format(h))
                 break
             except ValueError:
-                logging.warning("{} not available in docstring".format(h))
+                logger.warning("{} not available in docstring".format(h))
                 continue
         else:
             # in the case only 'Parameters' exist, trim till end of docstring
@@ -975,7 +975,7 @@ def _deserialize_model(
         recursion_depth: int,
         strict_version: bool = True
     ) -> Any:
-        logging.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
+        logger.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
         model_name = flow.class_name
         self._check_dependencies(flow.dependencies,
                                  strict_version=strict_version)
@@ -993,8 +993,7 @@ def _deserialize_model(

         for name in parameters:
             value = parameters.get(name)
-            logging.info('--%s flow_parameter=%s, value=%s' %
-                         ('-' * recursion_depth, name, value))
+            logger.info('--%s flow_parameter=%s, value=%s' % ('-' * recursion_depth, name, value))
             rval = self._deserialize_sklearn(
                 value,
                 components=components_,
@@ -1010,8 +1009,7 @@ def _deserialize_model(
             if name not in components_:
                 continue
             value = components[name]
-            logging.info('--%s flow_component=%s, value=%s'
-                         % ('-' * recursion_depth, name, value))
+            logger.info('--%s flow_component=%s, value=%s' % ('-' * recursion_depth, name, value))
             rval = self._deserialize_sklearn(
                 value,
                 recursion_depth=recursion_depth + 1,

tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -37,7 +37,7 @@
 # finding the root directory of conftest.py and going up to OpenML main directory
 # exploiting the fact that conftest.py always resides in the root directory for tests
 static_dir = os.path.dirname(os.path.abspath(__file__))
-logging.info("static directory: {}".format(static_dir))
+logger.info("static directory: {}".format(static_dir))
 print("static directory: {}".format(static_dir))
 while True:
     if 'openml' in os.listdir(static_dir):
@@ -178,4 +178,4 @@ def pytest_sessionfinish() -> None:
     compare_delete_files(file_list, new_file_list)
     logger.info("Local files deleted")

-    logging.info("{} is killed".format(worker))
+    logger.info("{} is killed".format(worker))
