diff --git a/.travis.yml b/.travis.yml index 7e6dca31..fba26644 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,12 +13,18 @@ matrix: env: TESTS=blocks FLOATX=float32 - python: 2.7 env: TESTS=blocks FLOATX=float32 DB=sqlite + - python: 2.7 + env: TESTS=blocks FLOATX=float32 DB=mimir - python: 3.4 env: TESTS=blocks FLOATX=float64 + - python: 2.7 + env: TESTS=blocks FLOATX=float64 DB=mimir - python: 2.7 env: TESTS=blocks-examples FLOATX=float32 - python: 2.7 env: TESTS=blocks-examples FLOATX=float32 DB=sqlite + - python: 2.7 + env: TESTS=blocks-examples FLOATX=float32 DB=mimir - python: 3.4 env: TESTS=blocks-examples FLOATX=float64 before_install: @@ -40,6 +46,10 @@ script: - export THEANO_FLAGS=floatX=$FLOATX,optimizer=fast_compile - export FUEL_FLOATX=$FLOATX - "if [[ $DB == 'sqlite' ]]; then echo 'log_backend: sqlite' > ~/.blocksrc; fi" + - | + if [[ $DB == 'mimir' ]]; then + echo 'log_backend: mimir' > ~/.blocksrc; + fi - # Running nose2 within coverage makes imports count towards coverage - function fail { export FAILED=1; } - | diff --git a/blocks/config.py b/blocks/config.py index d5e38ba0..5ee6b5db 100644 --- a/blocks/config.py +++ b/blocks/config.py @@ -48,8 +48,13 @@ .. option:: log_backend The backend to use for logging experiments. Defaults to `python`, which - stores the log as a Python object in memory. The other option is - `sqlite`. + stores the log as a Python object in memory. The other options are + `sqlite` and `mimir`. + +.. option:: log_arguments + + The arguments to pass to the logger. Defaults to an empty dictionary. + This option can be used to configure :class:`~.log.json.JSONLinesLog`. .. 
option:: sqlite_database, BLOCKS_SQLITEDB @@ -179,6 +184,7 @@ def str_or_none(val): config.add_config('profile', type_=bool_, default=False, env_var='BLOCKS_PROFILE') config.add_config('log_backend', type_=str, default='python') +config.add_config('log_arguments', type_=dict, default={}) config.add_config('sqlite_database', type_=str, default=os.path.expanduser('~/blocks_log.sqlite'), env_var='BLOCKS_SQLITEDB') diff --git a/blocks/log/__init__.py b/blocks/log/__init__.py index 0c87bed8..e4158060 100644 --- a/blocks/log/__init__.py +++ b/blocks/log/__init__.py @@ -1,7 +1,9 @@ from .log import TrainingLog from .sqlite import SQLiteLog +from .json import JSONLinesLog BACKENDS = { 'python': TrainingLog, - 'sqlite': SQLiteLog + 'sqlite': SQLiteLog, + 'mimir': JSONLinesLog } diff --git a/blocks/log/json.py b/blocks/log/json.py new file mode 100644 index 00000000..193b0a06 --- /dev/null +++ b/blocks/log/json.py @@ -0,0 +1,133 @@ +import os.path +from collections import deque +from six.moves import range +from mimir import Logger +from mimir.logger import _Logger + +from .log import TrainingLogBase + + +class PicklableLogger(_Logger): + """A picklable wrapper around mimir logger. + + This class is a picklable version of :class:`mimir.Logger`. + + """ + def __init__(self, **kwargs): + self.logger_kwargs = kwargs + self.opened = False + + def open(self): + if not self.opened: + logger = Logger(**self.logger_kwargs) + self.__dict__.update(logger.__dict__) + self.load(self.logger_kwargs['filename']) + self.opened = True + + def close(self): + self.opened = False + super(PicklableLogger, self).close() + + def __setstate__(self, state): + self.logger_kwargs = state + self.opened = False + self.open() + + def __getstate__(self): + return self.logger_kwargs + + +class JSONLinesLog(TrainingLogBase): + """A log stored in gzipped JSON Lines format. + + Each line of the log is a dictionary of the form + `{"iterations_done": N, "reports": {NAME: VALUE, ...}}`. 
+ + Examples + -------- + + Analysis of the log can be easily done with + `jq <https://jqlang.github.io/jq/>`__ + + .. code:: bash + gunzip -c log.jsonl.gz | jq '.reports.train_error' + + # Or equivalently + zcat log.jsonl.gz | jq '.reports.train_error' + + # To filter out null entries + zcat log.jsonl.gz | jq '.reports.train_error | select(.>0)' + + # To extract minimal training error + gunzip -c log.jsonl.gz | jq -s '. | map(.reports.true_cost) | min' + + # To include the iteration with minimal training error + gunzip -c log.jsonl.gz | jq -s '. | + map([.iterations_done, .reports.true_cost]) | min_by(.[1])' + + """ + def __init__(self, filename='log.jsonl.gz', maxlen=101, formatter=None, + **kwargs): + self.status = {} + super(JSONLinesLog, self).__init__() + if os.path.isfile(filename): + os.remove(filename) + self.logger = PicklableLogger( + filename=filename, maxlen=maxlen, formatter=formatter, **kwargs) + self.local_cache = deque() + + def flush(self, iterations_done): + if iterations_done < 0: + raise ValueError + if len(self.local_cache) > 0: + self.logger.log({'iterations_done': iterations_done, + 'reports': self.local_cache.popleft()}) + + def __getitem__(self, time): + self._check_time(time) + logger_len = self.inner_logger_len() + total_length = logger_len + len(self.local_cache) + + # Flush local cache + while len(self.local_cache) > 1: + self.flush(total_length - len(self.local_cache)) + logger_len = self.inner_logger_len() + + if time >= total_length: + # Need to create new item in local cache + self.local_cache.extend( + [{} for _ in range(time - total_length + 1)]) + if time < logger_len: + try: + if not self.logger[time]['iterations_done'] == time: + raise ValueError('iterations done') + return self.logger[time]['reports'] + except IndexError: + raise ValueError( + 'cannot get past log entries for JSON log, max log length ' + 'in memory is: {}'.format( + self.logger.logger_kwargs['maxlen'])) + if time >= logger_len: + return self.local_cache[time - logger_len] + + def 
inner_logger_len(self): + try: + return len(self.logger) + except AttributeError: + return 0 + + def __len__(self): + return self.inner_logger_len() + len(self.local_cache) + + def __setitem__(self, time, value): + raise ValueError('cannot manually change JSON Lines log') + + def __enter__(self): + self.logger.open() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.flush(self.status.get('iterations_done', -1)) + self.logger.close() + + def __iter__(self): + return iter([self[i] for i in range(len(self))]) diff --git a/blocks/log/log.py b/blocks/log/log.py index d2af257c..55fcbe78 100644 --- a/blocks/log/log.py +++ b/blocks/log/log.py @@ -63,6 +63,12 @@ def __init__(self, uuid=None): 'resumed_from': None }) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + @property def h_uuid(self): """Return a hexadecimal version of the UUID bytes. diff --git a/blocks/main_loop.py b/blocks/main_loop.py index b519a254..73562918 100644 --- a/blocks/main_loop.py +++ b/blocks/main_loop.py @@ -96,7 +96,7 @@ def __init__(self, algorithm, data_stream, model=None, log=None, if log is None: if log_backend is None: log_backend = config.log_backend - log = BACKENDS[log_backend]() + log = BACKENDS[log_backend](**config.log_arguments) if extensions is None: extensions = [] @@ -157,7 +157,7 @@ def run(self): set(self.algorithm.parameters)): logger.warning("different parameters for model and algorithm") - with change_recursion_limit(config.recursion_limit): + with change_recursion_limit(config.recursion_limit), self.log: self.original_sigint_handler = signal.signal( signal.SIGINT, self._handle_epoch_interrupt) self.original_sigterm_handler = signal.signal( diff --git a/docs/api/log.rst b/docs/api/log.rst index 916f46bd..b9b2a219 100644 --- a/docs/api/log.rst +++ b/docs/api/log.rst @@ -19,6 +19,14 @@ Dictionary backend :undoc-members: :show-inheritance: +JSON Lines backend +------------------ + +.. 
automodule:: blocks.log.json + :members: + :undoc-members: + :show-inheritance: + Sqlite backend -------------- diff --git a/req-travis-pip.txt b/req-travis-pip.txt index 7cf702bf..9ed77e00 100644 --- a/req-travis-pip.txt +++ b/req-travis-pip.txt @@ -10,3 +10,7 @@ git+https://github.com/Theano/Theano.git#egg=theano --allow-external fuel --allow-unverified fuel git+https://github.com/mila-udem/fuel#egg=fuel + +--allow-external mimir +--allow-unverified mimir +git+https://github.com/bartvm/mimir#egg=mimir diff --git a/requirements.txt b/requirements.txt index 2d9f5183..3c1fcae4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,7 @@ git+https://github.com/Theano/Theano.git#egg=theano --allow-external fuel --allow-unverified fuel git+https://github.com/mila-udem/fuel#egg=fuel + +--allow-external mimir +--allow-unverified mimir +git+https://github.com/bartvm/mimir#egg=mimir diff --git a/tests/extensions/test_saveload.py b/tests/extensions/test_saveload.py index d25bf4f4..49e24221 100644 --- a/tests/extensions/test_saveload.py +++ b/tests/extensions/test_saveload.py @@ -52,8 +52,9 @@ def test_save_and_load(self): algorithm=self.algorithm, extensions=[Load('myweirdmodel.tar')] ) - new_main_loop.extensions[0].main_loop = new_main_loop - new_main_loop._run_extensions('before_training') + with new_main_loop.log: + new_main_loop.extensions[0].main_loop = new_main_loop + new_main_loop._run_extensions('before_training') assert_allclose(self.W.get_value(), old_value) def test_load_log_and_iteration_state(self): @@ -68,6 +69,9 @@ def test_load_log_and_iteration_state(self): ) new_main_loop.extensions[0].main_loop = new_main_loop new_main_loop._run_extensions('before_training') + with new_main_loop.log: + # Open and close log + pass # Check the log new_keys = sorted(new_main_loop.log.status.keys()) old_keys = sorted(self.main_loop.log.status.keys()) diff --git a/tests/extensions/test_training.py b/tests/extensions/test_training.py index 6b216ecc..d75b09b4 100644 
--- a/tests/extensions/test_training.py +++ b/tests/extensions/test_training.py @@ -90,33 +90,34 @@ def test_track_the_best(): extension = TrackTheBest("cost") extension.main_loop = main_loop - main_loop.status['epochs_done'] += 1 - main_loop.status['iterations_done'] += 10 - main_loop.log.current_row['cost'] = 5 - extension.dispatch('after_epoch') - assert main_loop.status['best_cost'] == 5 - assert main_loop.log.current_row['cost_best_so_far'] - - main_loop.status['epochs_done'] += 1 - main_loop.status['iterations_done'] += 10 - main_loop.log.current_row['cost'] = 6 - extension.dispatch('after_epoch') - assert main_loop.status['best_cost'] == 5 - assert main_loop.log.current_row.get('cost_best_so_far', None) is None - - main_loop.status['epochs_done'] += 1 - main_loop.status['iterations_done'] += 10 - main_loop.log.current_row['cost'] = 5 - extension.dispatch('after_epoch') - assert main_loop.status['best_cost'] == 5 - assert main_loop.log.current_row.get('cost_best_so_far', None) is None - - main_loop.status['epochs_done'] += 1 - main_loop.status['iterations_done'] += 10 - main_loop.log.current_row['cost'] = 4 - extension.dispatch('after_epoch') - assert main_loop.status['best_cost'] == 4 - assert main_loop.log.current_row['cost_best_so_far'] + with main_loop.log: + main_loop.status['epochs_done'] += 1 + main_loop.status['iterations_done'] += 10 + main_loop.log.current_row['cost'] = 5 + extension.dispatch('after_epoch') + assert main_loop.status['best_cost'] == 5 + assert main_loop.log.current_row['cost_best_so_far'] + + main_loop.status['epochs_done'] += 1 + main_loop.status['iterations_done'] += 10 + main_loop.log.current_row['cost'] = 6 + extension.dispatch('after_epoch') + assert main_loop.status['best_cost'] == 5 + assert main_loop.log.current_row.get('cost_best_so_far', None) is None + + main_loop.status['epochs_done'] += 1 + main_loop.status['iterations_done'] += 10 + main_loop.log.current_row['cost'] = 5 + extension.dispatch('after_epoch') + assert 
main_loop.status['best_cost'] == 5 + assert main_loop.log.current_row.get('cost_best_so_far', None) is None + + main_loop.status['epochs_done'] += 1 + main_loop.status['iterations_done'] += 10 + main_loop.log.current_row['cost'] = 4 + extension.dispatch('after_epoch') + assert main_loop.status['best_cost'] == 4 + assert main_loop.log.current_row['cost_best_so_far'] class WriteCostExtension(TrainingExtension): diff --git a/tests/test_log.py b/tests/test_log.py index b26114d6..3251b983 100644 --- a/tests/test_log.py +++ b/tests/test_log.py @@ -3,30 +3,43 @@ from numpy.testing import assert_raises -from blocks.log import TrainingLog +from blocks.log import TrainingLog, JSONLinesLog from blocks.serialization import load, dump -def test_training_log(): - log = TrainingLog() +def run_log(log): + with log: + # test basic writing capabilities + log[0]['field'] = 45 + assert log[0]['field'] == 45 + assert log[1] == {} + assert log.current_row['field'] == 45 + log.status['iterations_done'] += 1 + assert log.status['iterations_done'] == 1 + assert log.previous_row['field'] == 45 + + assert_raises(ValueError, getitem, log, -1) + + # test iteration + assert len(list(log)) == 2 + - # test basic writing capabilities - log[0]['field'] = 45 - assert log[0]['field'] == 45 - assert log[1] == {} - assert log.current_row['field'] == 45 - log.status['iterations_done'] += 1 - assert log.status['iterations_done'] == 1 - assert log.previous_row['field'] == 45 +def test_json_lines_log(): + log = JSONLinesLog(maxlen=2) + run_log(log) - assert_raises(ValueError, getitem, log, -1) - # test iteration - assert len(list(log)) == 2 +def test_training_log(): + log = TrainingLog() + run_log(log) def test_pickle_log(): - log1 = TrainingLog() + log = TrainingLog() + pickle_log(log) + + +def pickle_log(log1): with open('log1.tar', 'wb') as f: dump(log1, f) with open('log1.tar', 'rb') as f: diff --git a/tests/test_main_loop.py b/tests/test_main_loop.py index 78b8f5c3..b425efd3 100644 --- 
a/tests/test_main_loop.py +++ b/tests/test_main_loop.py @@ -9,6 +9,7 @@ from numpy.testing import assert_raises from six.moves import cPickle +from blocks.log import TrainingLog from blocks.main_loop import MainLoop from blocks.extensions import TrainingExtension, FinishAfter, Printing from blocks.utils import unpack @@ -34,8 +35,8 @@ def test_main_loop(): assert_raises(AttributeError, getattr, main_loop, 'model') assert main_loop.log.status['iterations_done'] == 20 - assert main_loop.log.status['_epoch_ends'] == [10, 20] - assert len(main_loop.log) == 20 + if type(main_loop.log) == TrainingLog: + assert len(main_loop.log) == 20 for i in range(20): assert main_loop.log[i + 1]['batch'] == {'data': i % 10}