From 53aa5636cf169572bc05329ca41663b50d34f214 Mon Sep 17 00:00:00 2001 From: Oliver Neumann Date: Thu, 30 Apr 2020 13:54:50 +0200 Subject: [PATCH 01/43] Fixed broken link in PR template (#1675) * Fixed broken link in PR template. * Updated CHANGELOG.md --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- CHANGELOG.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index af80acf6a6390..0bda363228b1c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,7 +4,7 @@ - [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), Pull Request section? - [ ] Did you make sure to update the docs? - [ ] Did you write any new necessary tests? -- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CHANGELOG.md)? +- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? diff --git a/CHANGELOG.md b/CHANGELOG.md index 9750c6566b769..85edc73864efd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed broken link in PR template ([#1675](https://github.com/PyTorchLightning/pytorch-lightning/pull/1675)) - Fixed ModelCheckpoint not None checking filepath ([1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654)) From 8d564b5e38d1a1f820304a27f2d615d8bd4f401d Mon Sep 17 00:00:00 2001 From: Peter Yu <2057325+yukw777@users.noreply.github.com> Date: Thu, 30 Apr 2020 07:57:24 -0400 Subject: [PATCH 02/43] call on_load_checkpoint() when resuming from checkpoint (#1666) --- CHANGELOG.md | 1 + pytorch_lightning/trainer/training_io.py | 4 ++++ tests/trainer/test_trainer.py | 15 +++++++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85edc73864efd..10ec061f18b2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed broken link in PR template ([#1675](https://github.com/PyTorchLightning/pytorch-lightning/pull/1675)) - Fixed ModelCheckpoint not None checking filepath ([1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654)) +- Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([1666](https://github.com/PyTorchLightning/pytorch-lightning/pull/1666)) ## [0.7.5] - 2020-04-27 diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 82bc0829aa238..393d6540398b7 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -278,6 +278,10 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # load the state_dict on the model automatically model.load_state_dict(checkpoint['state_dict']) + + # give model a chance to load something + model.on_load_checkpoint(checkpoint) + if on_gpu: model.cuda(self.root_gpu) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 18cc2586a0b9f..cb650fd87e4c4 100644 --- a/tests/trainer/test_trainer.py +++ 
b/tests/trainer/test_trainer.py @@ -309,8 +309,8 @@ def test_model_freeze_unfreeze(): model.unfreeze() -def test_resume_from_checkpoint_epoch_restored(tmpdir): - """Verify resuming from checkpoint runs the right number of epochs""" +def test_resume_from_checkpoint(tmpdir): + """Verify resuming from checkpoint (epoch, batch numbers and on_load_checkpoint())""" import types tutils.reset_seed() @@ -322,6 +322,7 @@ def _new_model(): model = LightningTestModel(hparams) model.num_epochs_seen = 0 model.num_batches_seen = 0 + model.num_on_load_checkpoint_called = 0 def increment_epoch(self): self.num_epochs_seen += 1 @@ -329,10 +330,14 @@ def increment_epoch(self): def increment_batch(self, _): self.num_batches_seen += 1 - # Bind the increment_epoch function on_epoch_end so that the - # model keeps track of the number of epochs it has seen. + def increment_on_load_checkpoint(self, _): + self.num_on_load_checkpoint_called += 1 + + # Bind methods to keep track of epoch numbers, batch numbers it has seen + # as well as number of times it has called on_load_checkpoint() model.on_epoch_end = types.MethodType(increment_epoch, model) model.on_batch_start = types.MethodType(increment_batch, model) + model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model) return model model = _new_model() @@ -356,6 +361,7 @@ def increment_batch(self, _): assert model.num_epochs_seen == 2 assert model.num_batches_seen == training_batches * 2 + assert model.num_on_load_checkpoint_called == 0 # Other checkpoints can be uncommented if/when resuming mid-epoch is supported checkpoints = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, '*.ckpt'))) @@ -369,6 +375,7 @@ def increment_batch(self, _): new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check) new_trainer.fit(next_model) assert state['global_step'] + next_model.num_batches_seen == training_batches * trainer_options['max_epochs'] + assert next_model.num_on_load_checkpoint_called == 1 def 
_init_steps_model(): From f9c9e39ab87e393c157a30aa659be71eef11190e Mon Sep 17 00:00:00 2001 From: Jacob Zhong Date: Thu, 30 Apr 2020 07:58:03 -0400 Subject: [PATCH 03/43] Add log output for slurm (#1657) * add log output for slurm * change log levels * formatting Co-authored-by: Jirka Borovec --- pytorch_lightning/core/lightning.py | 2 ++ pytorch_lightning/trainer/distrib_data_parallel.py | 6 +++++- pytorch_lightning/trainer/training_io.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index a1f3eb4e9252c..26016613c6369 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -930,10 +930,12 @@ def init_ddp_connection( if 'MASTER_ADDR' not in os.environ: log.warning("MASTER_ADDR environment variable is not defined. Set as localhost") os.environ['MASTER_ADDR'] = '127.0.0.1' + log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") if 'MASTER_PORT' not in os.environ: log.warning("MASTER_PORT environment variable is not defined. 
Set as 12910") os.environ['MASTER_PORT'] = '12910' + log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size: log.warning("WORLD_SIZE environment variable is not equal to the computed " diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 659aa7a072f9a..56c7bae8ec6a7 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -277,6 +277,10 @@ def configure_slurm_ddp(self, num_gpu_nodes): except Exception as e: pass + # notify user the that slurm is managing tasks + if self.is_slurm_managing_tasks: + log.info('Multi-processing is handled by Slurm.') + def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): if data_parallel_device_ids is None: return @@ -293,7 +297,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): gpu_str = ','.join([str(x) for x in data_parallel_device_ids]) os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str - log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') + log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') def ddp_train(self, process_idx, model): """ diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 393d6540398b7..e49329538fcf5 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -215,7 +215,7 @@ def sig_handler(self, signum, frame): # pragma: no-cover if result == 0: log.info(f'requeued exp {job_id}') else: - log.info('requeue failed...') + log.warning('requeue failed...') # close experiment to avoid issues self.logger.close() From 2ec8d61e94722f7ecc97e1add72f4ac693d2f612 Mon Sep 17 00:00:00 2001 From: weipengOO98 <63845580+weipengOO98@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:58:42 +0800 Subject: [PATCH 04/43] Update 
new-project.rst (#1655) fix a typo --- docs/source/new-project.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 7d81ba44a352f..e3f3a892d983f 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -100,7 +100,7 @@ To also add a validation loop add the following functions def validation_epoch_end(self, outputs): avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs + return {'val_loss': avg_loss, 'log': tensorboard_logs} def val_dataloader(self): # TODO: do a real train/val split From d40425d2574c5698eed350e340d9ece779a68ac2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 30 Apr 2020 08:04:18 -0400 Subject: [PATCH 05/43] added warning to crash (#1625) * added warning to crash * formatting Co-authored-by: J. Borovec --- pytorch_lightning/core/lightning.py | 6 +++--- pytorch_lightning/trainer/training_io.py | 8 ++++++-- setup.cfg | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 26016613c6369..fc88fb8c78687 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1162,9 +1162,9 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, # native amp + lbfgs is a no go right now if self.trainer.use_amp and self.trainer.use_native_amp: - m = 'native PyTorch amp and lbfgs are not compatible. To request, please file' \ - 'a Github issue in PyTorch and tag @mcarilli' - raise MisconfigurationException(m) + raise MisconfigurationException( + 'native PyTorch amp and lbfgs are not compatible.' 
+ ' To request, please file a Github issue in PyTorch and tag @mcarilli') optimizer.step(second_order_closure) else: if self.trainer.use_amp and self.trainer.use_native_amp: diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index e49329538fcf5..78d24fad0a18f 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -251,9 +251,11 @@ def save_checkpoint(self, filepath): # do the actual save try: self._atomic_save(checkpoint, filepath) - except AttributeError: + except AttributeError as e: if 'hparams' in checkpoint: del checkpoint['hparams'] + rank_zero_warn('warning, `hparams` dropped from checkpoint.' + f' An attribute is not picklable {e}') self._atomic_save(checkpoint, filepath) @@ -434,9 +436,11 @@ def hpc_save(self, folderpath: str, logger): # TODO: fix for anything with multiprocess DP, DDP, DDP2 try: self._atomic_save(checkpoint, filepath) - except AttributeError: + except AttributeError as e: if 'hparams' in checkpoint: del checkpoint['hparams'] + rank_zero_warn('warning, `hparams` dropped from checkpoint.' 
+ f' An attribute is not picklable {e}') self._atomic_save(checkpoint, filepath) diff --git a/setup.cfg b/setup.cfg index 2f1b55c1894b4..aab7a580c77b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,7 @@ exclude_lines = pragma: no-cover warnings pass + rank_zero_warn [flake8] # TODO: this should be 88 or 100 according PEP8 From 3eac6cfd4fbbc4d13f4e93f6d90f8ee5302c421e Mon Sep 17 00:00:00 2001 From: Nathan Breitsch Date: Thu, 30 Apr 2020 08:04:50 -0400 Subject: [PATCH 06/43] Don't convert namedtuple to tuple (#1589) * Don't convert namedtuple to tuple * Test namedtuples sent to device correctly --- pytorch_lightning/trainer/distrib_parts.py | 13 +++++++++---- tests/models/test_cpu.py | 8 ++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 73efaf67c486b..db4e132c0b445 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -461,10 +461,15 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None): # when tuple if isinstance(batch, tuple): - batch = list(batch) - for i, x in enumerate(batch): - batch[i] = self.__transfer_data_to_device(x, device, gpu_id) - return tuple(batch) + # when namedtuple + if hasattr(batch, '_fields'): + elem_type = type(batch) + return elem_type(*(self.__transfer_data_to_device(x, device, gpu_id) for x in batch)) + else: + batch = list(batch) + for i, x in enumerate(batch): + batch[i] = self.__transfer_data_to_device(x, device, gpu_id) + return tuple(batch) # when dict if isinstance(batch, dict): diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 612286404041b..eb3b28769e206 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -1,3 +1,4 @@ +from collections import namedtuple import platform import pytest @@ -221,6 +222,13 @@ def test_single_gpu_batch_parse(): assert batch[1][0]['b'].device.index == 0 assert 
batch[1][0]['b'].type() == 'torch.cuda.FloatTensor' + # namedtuple of tensor + BatchType = namedtuple('BatchType', ['a', 'b']) + batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)] + batch = trainer.transfer_batch_to_gpu(batch, 0) + assert batch[0].a.device.index == 0 + assert batch[0].a.type() == 'torch.cuda.FloatTensor' + def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" From 142bc0230e228cd2e851481e5a07069e7d198655 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 30 Apr 2020 14:06:41 +0200 Subject: [PATCH 07/43] Learning rate log callback (#1498) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * base implementation * docs + implementation * fix styling * add lr string * renaming * CHANGELOG.md * add tests * Apply suggestions from code review Co-Authored-By: Adrian Wälchli * Apply suggestions from code review * Update pytorch_lightning/callbacks/lr_logger.py * Update pytorch_lightning/callbacks/lr_logger.py * add test for naming * base implementation * docs + implementation * fix styling * add lr string * renaming * CHANGELOG.md * add tests * Apply suggestions from code review Co-Authored-By: Adrian Wälchli * Apply suggestions from code review * Update pytorch_lightning/callbacks/lr_logger.py * Update pytorch_lightning/callbacks/lr_logger.py * add test for naming * Update pytorch_lightning/callbacks/lr_logger.py Co-Authored-By: Adrian Wälchli * suggestions from code review * fix styling * rebase * fix tests Co-authored-by: Nicki Skafte Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 + docs/source/callbacks.rst | 8 ++ pytorch_lightning/callbacks/__init__.py | 2 + pytorch_lightning/callbacks/lr_logger.py | 118 +++++++++++++++++++++++ tests/callbacks/test_callbacks.py | 59 +++++++++++- 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100755 pytorch_lightning/callbacks/lr_logger.py diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 10ec061f18b2b..f67e85a452ff9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added callback for logging learning rates ([#1498](https://github.com/PyTorchLightning/pytorch-lightning/pull/1498)) + ### Changed ### Deprecated diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst index 10323472facd8..a2969820b2eeb 100644 --- a/docs/source/callbacks.rst +++ b/docs/source/callbacks.rst @@ -84,3 +84,11 @@ We successfully extended functionality without polluting our super clean .. automodule:: pytorch_lightning.callbacks.progress :noindex: :exclude-members: + +--------- + +.. automodule:: pytorch_lightning.callbacks.lr_logger + :noindex: + :exclude-members: + _extract_lr, + _find_names \ No newline at end of file diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index c232060ca4ecb..7e8e0ce5bcfef 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -2,6 +2,7 @@ from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +from pytorch_lightning.callbacks.lr_logger import LearningRateLogger from pytorch_lightning.callbacks.progress import ProgressBarBase, ProgressBar __all__ = [ @@ -9,6 +10,7 @@ 'EarlyStopping', 'ModelCheckpoint', 'GradientAccumulationScheduler', + 'LearningRateLogger', 'ProgressBarBase', 'ProgressBar', ] diff --git a/pytorch_lightning/callbacks/lr_logger.py b/pytorch_lightning/callbacks/lr_logger.py new file mode 100755 index 0000000000000..6ad68905bc341 --- /dev/null +++ b/pytorch_lightning/callbacks/lr_logger.py @@ -0,0 +1,118 @@ +r""" + +Logging of learning rates +========================= + +Log learning rate for lr schedulers during training + +""" + +from 
pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class LearningRateLogger(Callback): + r""" + Automatically logs learning rate for learning rate schedulers during training. + + Example:: + + >>> from pytorch_lightning import Trainer + >>> from pytorch_lightning.callbacks import LearningRateLogger + >>> lr_logger = LearningRateLogger() + >>> trainer = Trainer(callbacks=[lr_logger]) + + Logging names are automatically determined based on optimizer class name. + In case of multiple optimizers of same type, they will be named `Adam`, + `Adam-1` etc. If a optimizer has multiple parameter groups they will + be named `Adam/pg1`, `Adam/pg2` etc. To control naming, pass in a + `name` keyword in the construction of the learning rate schdulers + + Example:: + + def configure_optimizer(self): + optimizer = torch.optim.Adam(...) + lr_scheduler = {'scheduler': torch.optim.lr_schedulers.LambdaLR(optimizer, ...) + 'name': 'my_logging_name'} + return [optimizer], [lr_scheduler] + """ + def __init__(self): + self.lrs = None + self.lr_sch_names = [] + + def on_train_start(self, trainer, pl_module): + """ Called before training, determines unique names for all lr + schedulers in the case of multiple of the same type or in + the case of multiple parameter groups + """ + if trainer.lr_schedulers == []: + raise MisconfigurationException( + 'Cannot use LearningRateLogger callback with models that have no' + ' learning rate schedulers. 
Please see documentation for' + ' `configure_optimizers` method.') + + if not trainer.logger: + raise MisconfigurationException( + 'Cannot use LearningRateLogger callback with Trainer that has no logger.') + + # Find names for schedulers + names = self._find_names(trainer.lr_schedulers) + + # Initialize for storing values + self.lrs = dict.fromkeys(names, []) + + def on_batch_start(self, trainer, pl_module): + latest_stat = self._extract_lr(trainer, 'step') + if trainer.logger and latest_stat: + trainer.logger.log_metrics(latest_stat, step=trainer.global_step) + + def on_epoch_start(self, trainer, pl_module): + latest_stat = self._extract_lr(trainer, 'epoch') + if trainer.logger and latest_stat: + trainer.logger.log_metrics(latest_stat, step=trainer.global_step) + + def _extract_lr(self, trainer, interval): + """ Extracts learning rates for lr schedulers and saves information + into dict structure. """ + latest_stat = {} + for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): + if scheduler['interval'] == interval: + param_groups = scheduler['scheduler'].optimizer.param_groups + if len(param_groups) != 1: + for i, pg in enumerate(param_groups): + lr, key = pg['lr'], f'{name}/{i + 1}' + self.lrs[key].append(lr) + latest_stat[key] = lr + else: + self.lrs[name].append(param_groups[0]['lr']) + latest_stat[name] = param_groups[0]['lr'] + return latest_stat + + def _find_names(self, lr_schedulers): + # Create uniqe names in the case we have multiple of the same learning + # rate schduler + multiple parameter groups + names = [] + for scheduler in lr_schedulers: + sch = scheduler['scheduler'] + if 'name' in scheduler: + name = scheduler['name'] + else: + opt_name = 'lr-' + sch.optimizer.__class__.__name__ + i, name = 1, opt_name + # Multiple schduler of the same type + while True: + if name not in names: + break + i, name = i + 1, f'{opt_name}-{i}' + + # Multiple param groups for the same schduler + param_groups = sch.optimizer.param_groups + if 
len(param_groups) != 1: + for i, pg in enumerate(param_groups): + temp = name + '/pg' + str(i + 1) + names.append(temp) + else: + names.append(name) + + self.lr_sch_names.append(name) + return names diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 9dba21eab07d8..a082c5ec6f1a6 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -2,11 +2,12 @@ import tests.base.utils as tutils from pytorch_lightning import Callback from pytorch_lightning import Trainer, LightningModule -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint from tests.base import ( LightTrainDataloader, LightTestMixin, LightValidationMixin, + LightTestOptimizersWithMixedSchedulingMixin, TestModelBase ) @@ -273,3 +274,59 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): # These should be different if the dirpath has be overridden assert trainer.ckpt_path != trainer.default_root_dir + + +def test_lr_logger_single_lr(tmpdir): + """ Test that learning rates are extracted and logged for single lr scheduler""" + tutils.reset_seed() + + class CurrentTestModel(LightTrainDataloader, TestModelBase): + pass + + hparams = tutils.get_default_hparams() + model = CurrentTestModel(hparams) + + lr_logger = LearningRateLogger() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=5, + val_percent_check=0.1, + train_percent_check=0.5, + callbacks=[lr_logger] + ) + results = trainer.fit(model) + + assert lr_logger.lrs, 'No learning rates logged' + assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \ + 'Number of learning rates logged does not match number of lr schedulers' + assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \ + 'Names of learning rates not set correctly' + + +def test_lr_logger_multi_lrs(tmpdir): + """ Test that learning rates are extracted and logged for multi lr schedulers """ + 
tutils.reset_seed() + + class CurrentTestModel(LightTestOptimizersWithMixedSchedulingMixin, + LightTrainDataloader, + TestModelBase): + pass + + hparams = tutils.get_default_hparams() + model = CurrentTestModel(hparams) + + lr_logger = LearningRateLogger() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.5, + callbacks=[lr_logger] + ) + results = trainer.fit(model) + + assert lr_logger.lrs, 'No learning rates logged' + assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \ + 'Number of learning rates logged does not match number of lr schedulers' + assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \ + 'Names of learning rates not set correctly' From 97c7b6b314a3916d49bf104e2eefe907ba74eba2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 May 2020 16:41:15 +0200 Subject: [PATCH 08/43] fixing LBFGS test (#1678) * params * drop acc * acc --- tests/base/utils.py | 2 +- tests/models/test_cpu.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/base/utils.py b/tests/base/utils.py index 1f0d582ed6e01..fc10d75bf868b 100644 --- a/tests/base/utils.py +++ b/tests/base/utils.py @@ -127,7 +127,7 @@ def get_default_model(lbfgs=False): hparams = get_default_hparams() if lbfgs: setattr(hparams, 'optimizer_name', 'lbfgs') - setattr(hparams, 'learning_rate', 0.002) + setattr(hparams, 'learning_rate', 0.005) model = LightningTestModel(hparams) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index eb3b28769e206..0c4ca6e42c9e1 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -81,7 +81,8 @@ def test_lbfgs_cpu_model(tmpdir): ) model, hparams = tutils.get_default_model(lbfgs=True) - tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.5) + # the test is there for the closure not the performance + tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.) 
def test_default_logger_callbacks_cpu_model(tmpdir): From 34bc1493596697a2dfc8c76036921b2bb2fb5013 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 May 2020 16:43:58 +0200 Subject: [PATCH 09/43] move unnecessary dict trainer_options (#1469) * move unnecessary dict trainer_options * fix tests * fix tests * formatting * missing --- tests/base/utils.py | 2 - tests/callbacks/test_callbacks.py | 21 ++-- tests/loggers/test_base.py | 15 +-- tests/loggers/test_neptune.py | 4 +- tests/loggers/test_trains.py | 7 +- tests/loggers/test_wandb.py | 8 +- tests/models/test_amp.py | 24 ++--- tests/models/test_cpu.py | 37 +++---- tests/models/test_gpu.py | 33 +++---- tests/models/test_restore.py | 40 ++++---- tests/trainer/test_checks.py | 12 +-- tests/trainer/test_dataloaders.py | 159 +++++++++--------------------- tests/trainer/test_optimizers.py | 40 +++----- tests/trainer/test_trainer.py | 58 +++++------ 14 files changed, 160 insertions(+), 300 deletions(-) diff --git a/tests/base/utils.py b/tests/base/utils.py index fc10d75bf868b..42e6d17d224d1 100644 --- a/tests/base/utils.py +++ b/tests/base/utils.py @@ -27,8 +27,6 @@ def assert_speed_parity(pl_times, pt_times, num_epochs): def run_model_test_without_loggers(trainer_options, model, min_acc=0.50): - # save_dir = trainer_options['default_root_dir'] - # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index a082c5ec6f1a6..fcd0836f2545a 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -126,13 +126,13 @@ def on_test_end(self, trainer, pl_module): test_callback = TestCallback() - trainer_options = { - 'callbacks': [test_callback], - 'max_epochs': 1, - 'val_percent_check': 0.1, - 'train_percent_check': 0.2, - 'progress_bar_refresh_rate': 0 - } + trainer_options = dict( + callbacks=[test_callback], + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2, + 
progress_bar_refresh_rate=0, + ) assert not test_callback.on_init_start_called assert not test_callback.on_init_end_called @@ -198,7 +198,7 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_test_end_called test_callback = TestCallback() - trainer_options['callbacks'] = [test_callback] + trainer_options.update(callbacks=[test_callback]) trainer = Trainer(**trainer_options) trainer.test(model) @@ -228,14 +228,13 @@ def training_step(self, *args, **kwargs): model = ModelWithoutValStep(hparams) stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1) - trainer_options = dict( + + trainer = Trainer( default_root_dir=tmpdir, early_stop_callback=stopping, overfit_pct=0.20, max_epochs=5, ) - - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1, 'training failed to complete' diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index 9bcbc2fb648af..56f6b97b0caa8 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -65,14 +65,12 @@ def test_custom_logger(tmpdir): logger = CustomLogger() - trainer_options = dict( + trainer = Trainer( max_epochs=1, train_percent_check=0.05, logger=logger, default_root_dir=tmpdir ) - - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1, "Training failed" assert logger.hparams_logged == hparams @@ -87,14 +85,12 @@ def test_multiple_loggers(tmpdir): logger1 = CustomLogger() logger2 = CustomLogger() - trainer_options = dict( + trainer = Trainer( max_epochs=1, train_percent_check=0.05, logger=[logger1, logger2], default_root_dir=tmpdir ) - - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1, "Training failed" @@ -113,9 +109,7 @@ def test_multiple_loggers_pickle(tmpdir): logger1 = CustomLogger() logger2 = CustomLogger() - trainer_options = dict(max_epochs=1, logger=[logger1, logger2]) - - trainer = Trainer(**trainer_options) + trainer = Trainer(max_epochs=1, logger=[logger1, 
logger2]) pkl_bytes = pickle.dumps(trainer) trainer2 = pickle.loads(pkl_bytes) trainer2.logger.log_metrics({"acc": 1.0}, 0) @@ -148,14 +142,13 @@ def decorated(metrics, step): model, hparams = tutils.get_default_model() model.validation_epoch_end = _validation_epoch_end model.training_epoch_end = _training_epoch_end - trainer_options = dict( + trainer = Trainer( max_epochs=4, default_root_dir=tmpdir, train_percent_check=0.001, val_percent_check=0.01, num_sanity_val_steps=0, ) - trainer = Trainer(**trainer_options) trainer.logger.log_metrics = _log_metrics_decorator( trainer.logger.log_metrics) trainer.fit(model) diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 09f531ab8c2a2..4cfdd673478ba 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -68,14 +68,12 @@ def test_neptune_leave_open_experiment_after_fit(tmpdir): def _run_training(logger): logger._experiment = MagicMock() - - trainer_options = dict( + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, train_percent_check=0.05, logger=logger ) - trainer = Trainer(**trainer_options) trainer.fit(model) return logger diff --git a/tests/loggers/test_trains.py b/tests/loggers/test_trains.py index 3dafb5706505d..e4ee78c65419c 100644 --- a/tests/loggers/test_trains.py +++ b/tests/loggers/test_trains.py @@ -18,13 +18,12 @@ def test_trains_logger(tmpdir): web_host='http://integration.trains.allegro.ai:8080', ) logger = TrainsLogger(project_name="lightning_log", task_name="pytorch lightning test") - trainer_options = dict( + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, train_percent_check=0.05, logger=logger ) - trainer = Trainer(**trainer_options) result = trainer.fit(model) print('result finished') @@ -44,13 +43,11 @@ def test_trains_pickle(tmpdir): web_host='http://integration.trains.allegro.ai:8080', ) logger = TrainsLogger(project_name="lightning_log", task_name="pytorch lightning test") - trainer_options = dict( + trainer = Trainer( 
default_root_dir=tmpdir, max_epochs=1, logger=logger ) - - trainer = Trainer(**trainer_options) pkl_bytes = pickle.dumps(trainer) trainer2 = pickle.loads(pkl_bytes) trainer2.logger.log_metrics({"acc": 1.0}) diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index d2ef6318c8c14..bb2739f95f6e0 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -47,17 +47,15 @@ class Experiment: logger = WandbLogger(id='the_id', offline=True) - trainer_options = dict(max_epochs=1, logger=logger) - - trainer = Trainer(**trainer_options) + trainer = Trainer(max_epochs=1, logger=logger) # Access the experiment to ensure it's created - trainer.logger.experiment + assert trainer.logger.experiment, 'missing experiment' pkl_bytes = pickle.dumps(trainer) trainer2 = pickle.loads(pkl_bytes) assert os.environ['WANDB_MODE'] == 'dryrun' assert trainer2.logger.__class__.__name__ == WandbLogger.__name__ - _ = trainer2.logger.experiment + assert trainer2.logger.experiment, 'missing experiment' wandb.init.assert_called() assert 'id' in wandb.init.call_args[1] diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 9b21e711d88eb..81a2325c09897 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -20,7 +20,7 @@ def test_amp_single_gpu(tmpdir, backend): model, hparams = tutils.get_default_model() - trainer_options = dict( + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, gpus=1, @@ -29,8 +29,6 @@ def test_amp_single_gpu(tmpdir, backend): ) # tutils.run_model_test(trainer_options, model) - - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 @@ -74,25 +72,21 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) - trainer_options = dict( - max_epochs=1, - gpus=[0], - distributed_backend='ddp', - precision=16 - ) - # exp file to get meta logger = tutils.get_default_logger(tmpdir) # exp file to get weights checkpoint = 
tutils.init_checkpoint_callback(logger) - # add these to the trainer options - trainer_options['checkpoint_callback'] = checkpoint - trainer_options['logger'] = logger - # fit model - trainer = Trainer(**trainer_options) + trainer = Trainer( + max_epochs=1, + gpus=[0], + distributed_backend='ddp', + precision=16, + checkpoint_callback=checkpoint, + logger=logger, + ) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 0c4ca6e42c9e1..e7b422dcb22cf 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -120,7 +120,8 @@ def test_running_test_after_fitting(tmpdir): # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=8, @@ -130,9 +131,6 @@ def test_running_test_after_fitting(tmpdir): checkpoint_callback=checkpoint, logger=logger ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1, 'training failed to complete' @@ -159,7 +157,8 @@ class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) - trainer_options = dict( + # fit model + trainer = Trainer( progress_bar_refresh_rate=0, max_epochs=1, train_percent_check=0.4, @@ -169,9 +168,6 @@ class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): logger=logger, early_stop_callback=False ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1, 'training failed to complete' @@ -238,16 +234,13 @@ def test_simple_cpu(tmpdir): hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, 
train_percent_check=0.1, ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # traning complete @@ -340,15 +333,6 @@ def train_dataloader(self): sampler=None, ) - trainer_options = dict( - default_root_dir=tmpdir, - max_epochs=1, - truncated_bptt_steps=truncated_bptt_steps, - val_percent_check=0, - weights_summary=None, - early_stop_callback=False - ) - hparams = tutils.get_default_hparams() hparams.batch_size = batch_size hparams.in_features = truncated_bptt_steps @@ -358,7 +342,14 @@ def train_dataloader(self): model = BpttTestModel(hparams) # fit model - trainer = Trainer(**trainer_options) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + truncated_bptt_steps=truncated_bptt_steps, + val_percent_check=0, + weights_summary=None, + early_stop_callback=False + ) result = trainer.fit(model) assert result == 1, 'training failed to complete' diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 69580fdd1f8eb..dcd90b08ce911 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -50,19 +50,19 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir): tutils.set_random_master_port() model, hparams = tutils.get_default_model() - trainer_options = dict(default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, - gpus=[0, 1], - distributed_backend='ddp') - fit_options = dict(train_dataloader=model.train_dataloader(), - val_dataloaders=model.val_dataloader()) - - trainer = Trainer(**trainer_options) - result = trainer.fit(model, **fit_options) + trainer = Trainer( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + train_percent_check=0.4, + val_percent_check=0.2, + gpus=[0, 1], + distributed_backend='ddp' + ) + result = trainer.fit(model, + train_dataloader=model.train_dataloader(), + val_dataloaders=model.val_dataloader()) assert result == 1, "DDP doesn't work with dataloaders passed to fit()." 
@@ -77,14 +77,12 @@ def test_cpu_slurm_save_load(tmpdir): logger = tutils.get_default_logger(tmpdir) version = logger.version - trainer_options = dict( + # fit model + trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir) ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) real_global_step = trainer.global_step @@ -115,12 +113,11 @@ def test_cpu_slurm_save_load(tmpdir): # new logger file to get meta logger = tutils.get_default_logger(tmpdir, version=version) - trainer_options = dict( + trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir), ) - trainer = Trainer(**trainer_options) model = LightningTestModel(hparams) # set the epoch start hook so we can predict before the model does the full training diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 1165da6f786e1..0921a3a871f10 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -163,12 +163,6 @@ def test_dp_resume(tmpdir): hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) - trainer_options = dict( - max_epochs=1, - gpus=2, - distributed_backend='dp', - ) - # get logger logger = tutils.get_default_logger(tmpdir) @@ -176,9 +170,13 @@ def test_dp_resume(tmpdir): # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) - # add these to the trainer options - trainer_options['logger'] = logger - trainer_options['checkpoint_callback'] = checkpoint + trainer_options = dict( + max_epochs=1, + gpus=2, + distributed_backend='dp', + logger=logger, + checkpoint_callback=checkpoint, + ) # fit model trainer = Trainer(**trainer_options) @@ -199,11 +197,13 @@ def test_dp_resume(tmpdir): # init new trainer new_logger = tutils.get_default_logger(tmpdir, version=logger.version) - trainer_options['logger'] = new_logger - trainer_options['checkpoint_callback'] = ModelCheckpoint(tmpdir) - trainer_options['train_percent_check'] 
= 0.5 - trainer_options['val_percent_check'] = 0.2 - trainer_options['max_epochs'] = 1 + trainer_options.update( + logger=new_logger, + checkpoint_callback=ModelCheckpoint(tmpdir), + train_percent_check=0.5, + val_percent_check=0.2, + max_epochs=1, + ) new_trainer = Trainer(**trainer_options) # set the epoch start hook so we can predict before the model does the full training @@ -240,14 +240,12 @@ def test_model_saving_loading(tmpdir): # logger file to get meta logger = tutils.get_default_logger(tmpdir) - trainer_options = dict( + # fit model + trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir) ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # traning complete @@ -289,7 +287,8 @@ def test_model_saving_loading(tmpdir): def test_load_model_with_missing_hparams(tmpdir): - trainer_options = dict( + # fit model + trainer = Trainer( progress_bar_refresh_rate=0, max_epochs=1, checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1), @@ -297,9 +296,6 @@ def test_load_model_with_missing_hparams(tmpdir): default_root_dir=tmpdir, ) - # fit model - trainer = Trainer(**trainer_options) - model = LightningTestModelWithoutHyperparametersArg() trainer.fit(model) last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1] diff --git a/tests/trainer/test_checks.py b/tests/trainer/test_checks.py index 2ee8037758d76..d69ec1e6e243a 100755 --- a/tests/trainer/test_checks.py +++ b/tests/trainer/test_checks.py @@ -21,8 +21,7 @@ class CurrentTestModel(LightningModule): def forward(self, x): pass - trainer_options = dict(default_root_dir=tmpdir, max_epochs=1) - trainer = Trainer(**trainer_options) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) with pytest.raises(MisconfigurationException): model = CurrentTestModel() @@ -37,8 +36,7 @@ def test_error_on_no_train_dataloader(tmpdir): class CurrentTestModel(TestModelBase): pass - trainer_options = 
dict(default_root_dir=tmpdir, max_epochs=1) - trainer = Trainer(**trainer_options) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) with pytest.raises(MisconfigurationException): model = CurrentTestModel(hparams) @@ -56,8 +54,7 @@ def forward(self, x): def training_step(self, batch, batch_idx, optimizer_idx=None): pass - trainer_options = dict(default_root_dir=tmpdir, max_epochs=1) - trainer = Trainer(**trainer_options) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) with pytest.raises(MisconfigurationException): model = CurrentTestModel() @@ -74,8 +71,7 @@ def test_warning_on_wrong_validation_settings(tmpdir): tutils.reset_seed() hparams = tutils.get_default_hparams() - trainer_options = dict(default_root_dir=tmpdir, max_epochs=1) - trainer = Trainer(**trainer_options) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) class CurrentTestModel(LightTrainDataloader, LightValidationDataloader, diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index d52e61eab9047..83ff481d694f9 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -24,7 +24,13 @@ ) -def test_dataloader_config_errors(tmpdir): +@pytest.mark.parametrize("dataloader_options", [ + dict(train_percent_check=-0.1), + dict(train_percent_check=1.1), + dict(val_check_interval=1.1), + dict(val_check_interval=10000), +]) +def test_dataloader_config_errors(tmpdir, dataloader_options): tutils.reset_seed() class CurrentTestModel( @@ -36,63 +42,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # percent check < 0 - - # logger file to get meta - trainer_options = dict( - default_root_dir=tmpdir, - max_epochs=1, - train_percent_check=-0.1, - ) - - # fit model - trainer = Trainer(**trainer_options) - - with pytest.raises(ValueError): - trainer.fit(model) - - # percent check > 1 - - # logger file to get meta - trainer_options = dict( - default_root_dir=tmpdir, - 
max_epochs=1, - train_percent_check=1.1, - ) - - # fit model - trainer = Trainer(**trainer_options) - - with pytest.raises(ValueError): - trainer.fit(model) - - # int val_check_interval > num batches - - # logger file to get meta - trainer_options = dict( - default_root_dir=tmpdir, - max_epochs=1, - val_check_interval=10000 - ) - # fit model - trainer = Trainer(**trainer_options) - - with pytest.raises(ValueError): - trainer.fit(model) - - # float val_check_interval > 1 - - # logger file to get meta - trainer_options = dict( + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - val_check_interval=1.1 + **dataloader_options, ) - # fit model - trainer = Trainer(**trainer_options) - with pytest.raises(ValueError): trainer.fit(model) @@ -111,16 +67,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=1.0, ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # verify training completed @@ -150,16 +103,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) trainer.fit(model) trainer.test() @@ -184,19 +134,15 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): hparams = tutils.get_default_hparams() - # logger file to get meta - trainer_options = dict( + # only train passed to fit + model = CurrentTestModel(hparams) + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # only train passed to fit - model = CurrentTestModel(hparams) - trainer = Trainer(**trainer_options) - 
fit_options = dict(train_dataloader=model._dataloader(train=True)) - result = trainer.fit(model, **fit_options) + result = trainer.fit(model, train_dataloader=model._dataloader(train=True)) assert result == 1 @@ -214,21 +160,17 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() - # logger file to get meta - trainer_options = dict( + # train, val passed to fit + model = CurrentTestModel(hparams) + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # train, val passed to fit - model = CurrentTestModel(hparams) - trainer = Trainer(**trainer_options) - fit_options = dict(train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) - - result = trainer.fit(model, **fit_options) + result = trainer.fit(model, + train_dataloader=model._dataloader(train=True), + val_dataloaders=model._dataloader(train=False)) assert result == 1 assert len(trainer.val_dataloaders) == 1, \ f'`val_dataloaders` not initiated properly, got {trainer.val_dataloaders}' @@ -249,24 +191,20 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() - # logger file to get meta - trainer_options = dict( + # train, val and test passed to fit + model = CurrentTestModel(hparams) + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - # train, val and test passed to fit - model = CurrentTestModel(hparams) - trainer = Trainer(**trainer_options) - fit_options = dict(train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) - test_options = dict(test_dataloaders=model._dataloader(train=False)) - - result = trainer.fit(model, **fit_options) + result = trainer.fit(model, + train_dataloader=model._dataloader(train=True), + val_dataloaders=model._dataloader(train=False)) - trainer.test(**test_options) + trainer.test(test_dataloaders=model._dataloader(train=False)) assert result == 1 assert 
len(trainer.val_dataloaders) == 1, \ @@ -288,25 +226,23 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() - # logger file to get meta - trainer_options = dict( + # train, multiple val and multiple test passed to fit + model = CurrentTestModel(hparams) + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - # train, multiple val and multiple test passed to fit - model = CurrentTestModel(hparams) - trainer = Trainer(**trainer_options) - fit_options = dict(train_dataloader=model._dataloader(train=True), - val_dataloaders=[model._dataloader(train=False), - model._dataloader(train=False)]) - test_options = dict(test_dataloaders=[model._dataloader(train=False), - model._dataloader(train=False)]) + results = trainer.fit( + model, + train_dataloader=model._dataloader(train=True), + val_dataloaders=[model._dataloader(train=False), model._dataloader(train=False)], + ) + assert results - results = trainer.fit(model, **fit_options) - trainer.test(**test_options) + trainer.test(test_dataloaders=[model._dataloader(train=False), model._dataloader(train=False)]) assert len(trainer.val_dataloaders) == 2, \ f'Multiple `val_dataloaders` not initiated properly, got {trainer.val_dataloaders}' @@ -329,7 +265,6 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -341,6 +276,7 @@ class CurrentTestModel( trainer = Trainer(**trainer_options) fit_options = dict(val_dataloaders=model._dataloader(train=False)) results = trainer.fit(model, **fit_options) + assert results # fit model trainer = Trainer(**trainer_options) @@ -506,20 +442,17 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + fit_options = dict(train_dataloader=model._dataloader(train=True), + 
val_dataloaders=model._dataloader(train=False)) + test_options = dict(test_dataloaders=model._dataloader(train=False)) + + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - fit_options = dict(train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) - test_options = dict(test_dataloaders=model._dataloader(train=False)) - - trainer = Trainer(**trainer_options) - # fit model with pytest.warns(UserWarning, match='train'): trainer.fit(model, **fit_options) diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index 6ac9da23d8368..b445dcb2f7173 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -29,16 +29,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) results = trainer.fit(model) init_lr = hparams.learning_rate @@ -68,16 +65,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) results = trainer.fit(model) init_lr = hparams.learning_rate @@ -111,16 +105,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) results = trainer.fit(model) init_lr = hparams.learning_rate @@ -160,17 
+151,15 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) results = trainer.fit(model) + assert results assert trainer.lr_schedulers[0] == \ dict(scheduler=trainer.lr_schedulers[0]['scheduler'], monitor='val_loss', @@ -260,16 +249,13 @@ class CurrentTestModel( hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # verify training completed @@ -291,9 +277,7 @@ def configure_optimizers(self): hparams = tutils.get_default_hparams() model = CurrentTestModel(hparams) - trainer_options = dict(default_save_path=tmpdir, max_epochs=1) - # fit model - trainer = Trainer(**trainer_options) + trainer = Trainer(default_save_path=tmpdir, max_epochs=1) result = trainer.fit(model) assert result == 1 diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index cb650fd87e4c4..b7344a70b3e40 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -36,16 +36,12 @@ def test_model_pickle(tmpdir): def test_hparams_save_load(tmpdir): model = DictHparamsModel({'in_features': 28 * 28, 'out_features': 10, 'failed_key': lambda x: x}) - # logger file to get meta - trainer_options = dict( + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, ) - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) - assert result == 1 # try to load the model now @@ -69,16 +65,13 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): # logger file to get meta logger = 
tutils.get_default_logger(tmpdir) - trainer_options = dict( + trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir) ) - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) - # training complete assert result == 1, 'amp + ddp model failed to complete' @@ -110,14 +103,12 @@ class CurrentTestModel(LightTrainDataloader, LightValidationStepMixin, TestModel # logger file to get meta logger = tutils.get_default_logger(tmpdir) - trainer_options = dict( + # fit model + trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir) ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # traning complete @@ -353,8 +344,8 @@ def increment_on_load_checkpoint(self, _): val_check_interval=1., ) - # fit model trainer = Trainer(**trainer_options) + # fit model trainer.fit(model) training_batches = trainer.num_training_batches @@ -399,11 +390,11 @@ def test_trainer_max_steps_and_epochs(tmpdir): model, trainer_options, num_train_samples = _init_steps_model() # define less train steps than epochs - trainer_options.update(dict( + trainer_options.update( default_root_dir=tmpdir, max_epochs=3, max_steps=num_train_samples + 10 - )) + ) # fit model trainer = Trainer(**trainer_options) @@ -414,10 +405,10 @@ def test_trainer_max_steps_and_epochs(tmpdir): assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" # define less train epochs than steps - trainer_options.update(dict( + trainer_options.update( max_epochs=2, max_steps=trainer_options['max_epochs'] * 2 * num_train_samples - )) + ) # fit model trainer = Trainer(**trainer_options) @@ -434,13 +425,13 @@ def test_trainer_min_steps_and_epochs(tmpdir): model, trainer_options, num_train_samples = _init_steps_model() # define callback for stopping the model and default epochs - trainer_options.update(dict( + trainer_options.update( default_root_dir=tmpdir, 
early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0), val_check_interval=2, min_epochs=1, max_epochs=5 - )) + ) # define less min steps than 1 epoch trainer_options['min_steps'] = math.floor(num_train_samples / 2) @@ -484,15 +475,12 @@ class CurrentTestModel( # verify torch.backends.cudnn.benchmark is not turned on assert not torch.backends.cudnn.benchmark - # logger file to get meta - trainer_options = dict( + # fit model + trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, benchmark=True, ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) # verify training completed @@ -660,17 +648,15 @@ def on_batch_start(self, trainer, pl_module): interrupt_callback = InterruptCallback() - trainer_options = { - 'callbacks': [interrupt_callback], - 'max_epochs': 1, - 'val_percent_check': 0.1, - 'train_percent_check': 0.2, - 'progress_bar_refresh_rate': 0, - 'logger': False, - 'default_root_dir': tmpdir, - } - - trainer = Trainer(**trainer_options) + trainer = Trainer( + callbacks=[interrupt_callback], + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2, + progress_bar_refresh_rate=0, + logger=False, + default_root_dir=tmpdir, + ) assert not trainer.interrupted trainer.fit(model) assert trainer.interrupted From 2950f669834506f8e5845b318b0f25d52d19e331 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Fri, 1 May 2020 11:13:35 -0700 Subject: [PATCH 10/43] Fix Horovod distributed backend to set the root_gpu property (#1669) * params * drop acc * Fix Horovod distributed backend to set the root_gpu * Fixed test * Fixed tests * Fixed lint * Set root_gpu during initialization * chlog Co-authored-by: Jirka --- CHANGELOG.md | 4 +++- .../trainer/distrib_data_parallel.py | 16 ++++++++++++---- pytorch_lightning/trainer/distrib_parts.py | 8 +++----- tests/models/data/horovod/train_default_model.py | 8 +++++++- tests/models/test_horovod.py | 7 +++++-- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index f67e85a452ff9..94675a20111c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,10 +18,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed broken link in PR template ([#1675](https://github.com/PyTorchLightning/pytorch-lightning/pull/1675)) - Fixed ModelCheckpoint not None checking filepath ([1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654)) + - Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([1666](https://github.com/PyTorchLightning/pytorch-lightning/pull/1666)) +- Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/PyTorchLightning/pytorch-lightning/pull/1669)) + ## [0.7.5] - 2020-04-27 diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 56c7bae8ec6a7..8651dd5c1b5a0 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -194,8 +194,7 @@ def set_distributed_mode(self, distributed_backend): if distributed_backend is None: if self.has_horovodrun(): - self.check_horovod() - self.use_horovod = True + self._set_horovod_backend() elif self.num_gpus == 0: if self.num_nodes > 1 or self.num_processes > 1: self.use_ddp = True # ddp_cpu @@ -235,8 +234,7 @@ def set_distributed_mode(self, distributed_backend): self.data_parallel_device_ids = None self.on_gpu = False elif distributed_backend == 'horovod': - self.check_horovod() - self.use_horovod = True + self._set_horovod_backend() # throw error to force user ddp or ddp2 choice if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp): @@ -421,6 +419,16 @@ def resolve_root_node_address(self, root_node): return root_node + def _set_horovod_backend(self): + self.check_horovod() + self.use_horovod = True + + # Initialize Horovod to get rank / size info + hvd.init() + if self.on_gpu: + # Horovod assigns one local 
GPU per process + self.root_gpu = hvd.local_rank() + def check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" if not HOROVOD_AVAILABLE: diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index db4e132c0b445..a9f4b6114522e 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -570,13 +570,11 @@ def dp_train(self, model): model.forward = model_autocast_original_forward def horovod_train(self, model): - # Horovod: initialize library - hvd.init() - if torch.cuda.is_available() and self.on_gpu: # Horovod: pin GPU to local rank - torch.cuda.set_device(hvd.local_rank()) - model.cuda(hvd.local_rank()) + assert self.root_gpu == hvd.local_rank() + torch.cuda.set_device(self.root_gpu) + model.cuda(self.root_gpu) # Only show progress bar from the first worker self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0 diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 6c11e2ca5e755..3410cdc1d5051 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -27,12 +27,14 @@ PATH_ROOT = os.path.join(PATH_HERE, '..', '..', '..', '..') sys.path.insert(0, os.path.abspath(PATH_ROOT)) +from pytorch_lightning import Trainer # noqa: E402 from pytorch_lightning.callbacks import ModelCheckpoint # noqa: E402 import tests.base.utils as tutils # noqa: E402 parser = argparse.ArgumentParser() parser.add_argument('--trainer-options', required=True) +parser.add_argument('--on-gpu', action='store_true', default=False) def run_test_from_config(trainer_options): @@ -44,11 +46,15 @@ def run_test_from_config(trainer_options): trainer_options['checkpoint_callback'] = ModelCheckpoint(ckpt_path) model, hparams = tutils.get_default_model() - tutils.run_model_test(trainer_options, 
model, version=0, with_hpc=False) + tutils.run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False) # Horovod should be initialized following training. If not, this will raise an exception. assert hvd.size() == 2 + if args.on_gpu: + # Test the root_gpu property + assert Trainer(gpus=1, distributed_backend='horovod', max_epochs=1).root_gpu == hvd.local_rank() + if __name__ == "__main__": args = parser.parse_args() diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index c4bcb4b81b995..21a90c191579b 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -38,10 +38,12 @@ def _nccl_available(): return False -def _run_horovod(trainer_options): +def _run_horovod(trainer_options, on_gpu=False): """Execute the training script across multiple workers in parallel.""" cmdline = ['horovodrun', '-np', '2', sys.executable, TEST_SCRIPT, '--trainer-options', shlex.quote(json.dumps(trainer_options))] + if on_gpu: + cmdline += ['--on-gpu'] exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 @@ -93,7 +95,7 @@ def test_horovod_multi_gpu(tmpdir): gpus=1, distributed_backend='horovod' ) - _run_horovod(trainer_options) + _run_horovod(trainer_options, on_gpu=True) @pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8") @@ -159,5 +161,6 @@ def get_model_params(model): def get_optimizer_params(optimizer): return set([p for group in optimizer.param_groups for p in group.get('params', [])]) + assert get_model_params(model.generator) != get_model_params(model.discriminator) assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0]) assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) From b4b73f92dd86f67166d37451a40c238cfbae8f48 Mon Sep 17 00:00:00 2001 From: Fedor Korotkov Date: Sat, 2 May 2020 06:47:31 -0400 Subject: [PATCH 11/43] Trigger automatic rebase on 
issue comment (#1695) * Trigger automatic rebase on issue comment Instead of `pull_request` event (created, closed, etc.). Fixes https://github.com/cirrus-actions/rebase/issues/43 * Removed workaround --- .github/workflows/rebase.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/rebase.yml b/.github/workflows/rebase.yml index 2aa94bea05611..06d20652c6b5a 100644 --- a/.github/workflows/rebase.yml +++ b/.github/workflows/rebase.yml @@ -1,8 +1,9 @@ name: Automatic Rebase # https://github.com/marketplace/actions/automatic-rebase -on: - - pull_request +on: + issue_comment: + types: [created] jobs: rebase: @@ -17,10 +18,3 @@ jobs: uses: cirrus-actions/rebase@1.2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # https://github.community/t5/GitHub-Actions/Workflow-is-failing-if-no-job-can-be-ran-due-to-condition/m-p/38186#M3250 - always_job: - name: Always run job - runs-on: ubuntu-latest - steps: - - name: Always run - run: echo "This job is used to prevent the workflow to fail when all other jobs are skipped." 
From f3800279512cd90ac29c3b0a9f5b859a4ba8f70a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 2 May 2020 14:38:22 +0200 Subject: [PATCH 12/43] refactor default model (#1652) * refactor default model * drop redundant seeds * formatting * path * formatting * rename --- tests/base/eval_model_optimizers.py | 12 ++--- tests/base/eval_model_template.py | 2 +- tests/base/eval_model_test_steps.py | 2 +- tests/base/eval_model_valid_steps.py | 2 +- tests/base/utils.py | 20 ++------ tests/callbacks/test_callbacks.py | 7 +-- tests/loggers/test_all.py | 7 +-- tests/loggers/test_base.py | 4 +- tests/loggers/test_neptune.py | 5 +- tests/loggers/test_trains.py | 4 -- tests/loggers/test_wandb.py | 4 -- .../data/horovod/train_default_model.py | 12 ++--- tests/models/test_amp.py | 16 ++---- tests/models/test_cpu.py | 50 +++++-------------- tests/models/test_gpu.py | 40 ++++++--------- tests/models/test_horovod.py | 8 ++- tests/models/test_restore.py | 11 ---- tests/trainer/test_checks.py | 5 -- tests/trainer/test_dataloaders.py | 12 ----- tests/trainer/test_lr_finder.py | 12 ++--- tests/trainer/test_optimizers.py | 13 ++--- tests/trainer/test_trainer.py | 22 ++------ tests/trainer/test_trainer_cli.py | 1 - 23 files changed, 77 insertions(+), 194 deletions(-) diff --git a/tests/base/eval_model_optimizers.py b/tests/base/eval_model_optimizers.py index 1666e26ed81aa..bcce319d4a565 100644 --- a/tests/base/eval_model_optimizers.py +++ b/tests/base/eval_model_optimizers.py @@ -15,7 +15,7 @@ def configure_optimizers(self): def configure_optimizers_empty(self): return None - def configure_optimizers_lbfgs(self): + def configure_optimizers__lbfgs(self): """ return whatever optimizers we want here. 
:return: list of optimizers @@ -23,7 +23,7 @@ def configure_optimizers_lbfgs(self): optimizer = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) return optimizer - def configure_optimizers_multiple_optimizers(self): + def configure_optimizers__multiple_optimizers(self): """ return whatever optimizers we want here. :return: list of optimizers @@ -33,12 +33,12 @@ def configure_optimizers_multiple_optimizers(self): optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) return optimizer1, optimizer2 - def configure_optimizers_single_scheduler(self): + def configure_optimizers__single_scheduler(self): optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1) return [optimizer], [lr_scheduler] - def configure_optimizers_multiple_schedulers(self): + def configure_optimizers__multiple_schedulers(self): optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) @@ -46,7 +46,7 @@ def configure_optimizers_multiple_schedulers(self): return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] - def configure_optimizers_mixed_scheduling(self): + def configure_optimizers__mixed_scheduling(self): optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 4, gamma=0.1) @@ -55,7 +55,7 @@ def configure_optimizers_mixed_scheduling(self): return [optimizer1, optimizer2], \ [{'scheduler': lr_scheduler1, 'interval': 'step'}, lr_scheduler2] - def configure_optimizers_reduce_lr_on_plateau(self): + def configure_optimizers__reduce_lr_on_plateau(self): optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler = 
optim.lr_scheduler.ReduceLROnPlateau(optimizer) return [optimizer], [lr_scheduler] diff --git a/tests/base/eval_model_template.py b/tests/base/eval_model_template.py index 77d83b483bcb9..37f4dfbd04144 100644 --- a/tests/base/eval_model_template.py +++ b/tests/base/eval_model_template.py @@ -33,7 +33,7 @@ class EvalModelTemplate( """ This template houses all combinations of model configurations we want to test """ - def __init__(self, hparams): + def __init__(self, hparams: object) -> object: """Pass in parsed HyperOptArgumentParser to the model.""" # init superclass super().__init__() diff --git a/tests/base/eval_model_test_steps.py b/tests/base/eval_model_test_steps.py index ed8fe75cd3777..b4c80cff06421 100644 --- a/tests/base/eval_model_test_steps.py +++ b/tests/base/eval_model_test_steps.py @@ -45,7 +45,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs): }) return output - def test_step_multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): + def test_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): """ Default, baseline test_step :param batch: diff --git a/tests/base/eval_model_valid_steps.py b/tests/base/eval_model_valid_steps.py index 1f40b45f80434..d6c9a84792054 100644 --- a/tests/base/eval_model_valid_steps.py +++ b/tests/base/eval_model_valid_steps.py @@ -51,7 +51,7 @@ def validation_step(self, batch, batch_idx, *args, **kwargs): }) return output - def validation_step_multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): + def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): """ Lightning calls this inside the validation loop :param batch: diff --git a/tests/base/utils.py b/tests/base/utils.py index 42e6d17d224d1..f27d0bbdcb39c 100644 --- a/tests/base/utils.py +++ b/tests/base/utils.py @@ -9,7 +9,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger from tests import TEMP_PATH, 
RANDOM_PORTS, RANDOM_SEEDS -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate from tests.base.datasets import PATH_DATASETS @@ -27,6 +27,8 @@ def assert_speed_parity(pl_times, pt_times, num_epochs): def run_model_test_without_loggers(trainer_options, model, min_acc=0.50): + reset_seed() + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) @@ -54,6 +56,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc=0.50): def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=True): + reset_seed() save_dir = trainer_options['default_root_dir'] # logger file to get meta @@ -95,8 +98,6 @@ def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=T def get_default_hparams(continue_training=False, hpc_exp_number=0): - _ = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - args = { 'drop_prob': 0.2, 'batch_size': 32, @@ -120,18 +121,6 @@ def get_default_hparams(continue_training=False, hpc_exp_number=0): return hparams -def get_default_model(lbfgs=False): - # set up model with these hyperparams - hparams = get_default_hparams() - if lbfgs: - setattr(hparams, 'optimizer_name', 'lbfgs') - setattr(hparams, 'learning_rate', 0.005) - - model = LightningTestModel(hparams) - - return model, hparams - - def get_default_logger(save_dir, version=None): # set up logger object without actually saving logs logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version) @@ -229,6 +218,7 @@ def reset_seed(): def set_random_master_port(): + reset_seed() port = RANDOM_PORTS.pop() os.environ['MASTER_PORT'] = str(port) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index fcd0836f2545a..2bbcfaea1f191 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -214,8 +214,6 @@ def on_test_end(self, trainer, pl_module): def test_early_stopping_no_val_step(tmpdir): """Test 
that early stopping callback falls back to training metrics when no validation defined.""" - tutils.reset_seed() - class ModelWithoutValStep(LightTrainDataloader, TestModelBase): def training_step(self, *args, **kwargs): @@ -224,8 +222,7 @@ def training_step(self, *args, **kwargs): output.update({'my_train_metric': loss}) return output - hparams = tutils.get_default_hparams() - model = ModelWithoutValStep(hparams) + model = ModelWithoutValStep(tutils.get_default_hparams()) stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1) @@ -269,7 +266,7 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): overfit_pct=0.20, max_epochs=5 ) - result = trainer.fit(model) + trainer.fit(model) # These should be different if the dirpath has be overridden assert trainer.ckpt_path != trainer.default_root_dir diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 383ca263c7709..06e93fa6a23f4 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -7,6 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import ( TensorBoardLogger, MLFlowLogger, NeptuneLogger, TestTubeLogger, CometLogger) +from tests.base import EvalModelTemplate def _get_logger_args(logger_class, save_dir): @@ -29,14 +30,12 @@ def _get_logger_args(logger_class, save_dir): ]) def test_loggers_fit_test(tmpdir, monkeypatch, logger_class): """Verify that basic functionality of all loggers.""" - tutils.reset_seed() - # prevent comet logger from trying to print at exit, since # pytest's stdout/stderr redirection breaks it import atexit monkeypatch.setattr(atexit, 'register', lambda _: None) - model, _ = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) class StoreHistoryLogger(logger_class): def __init__(self, *args, **kwargs): @@ -78,8 +77,6 @@ def log_metrics(self, metrics, step): ]) def test_loggers_pickle(tmpdir, monkeypatch, logger_class): """Verify that pickling trainer with logger works.""" - 
tutils.reset_seed() - # prevent comet logger from trying to print at exit, since # pytest's stdout/stderr redirection breaks it import atexit diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index 56f6b97b0caa8..1a52dadf9c621 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection from pytorch_lightning.utilities import rank_zero_only -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate def test_logger_collection(): @@ -139,7 +139,7 @@ def decorated(metrics, step): return decorated - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) model.validation_epoch_end = _validation_epoch_end model.training_epoch_end = _training_epoch_end trainer = Trainer( diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 4cfdd673478ba..11961234ed41b 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -61,10 +61,7 @@ def test_neptune_additional_methods(neptune): def test_neptune_leave_open_experiment_after_fit(tmpdir): """Verify that neptune experiment was closed after training""" - tutils.reset_seed() - - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = LightningTestModel(tutils.get_default_hparams()) def _run_training(logger): logger._experiment = MagicMock() diff --git a/tests/loggers/test_trains.py b/tests/loggers/test_trains.py index e4ee78c65419c..305d0707079b0 100644 --- a/tests/loggers/test_trains.py +++ b/tests/loggers/test_trains.py @@ -8,8 +8,6 @@ def test_trains_logger(tmpdir): """Verify that basic functionality of TRAINS logger works.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) TrainsLogger.set_bypass_mode(True) @@ -33,8 +31,6 @@ def 
test_trains_logger(tmpdir): def test_trains_pickle(tmpdir): """Verify that pickling trainer with TRAINS logger works.""" - tutils.reset_seed() - # hparams = tutils.get_default_hparams() # model = LightningTestModel(hparams) TrainsLogger.set_bypass_mode(True) diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index bb2739f95f6e0..82555f88e83cd 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -11,8 +11,6 @@ def test_wandb_logger(wandb): """Verify that basic functionality of wandb logger works. Wandb doesn't work well with pytest so we have to mock it out here.""" - tutils.reset_seed() - logger = WandbLogger(anonymous=True, offline=True) logger.log_metrics({'acc': 1.0}) @@ -38,8 +36,6 @@ def test_wandb_pickle(wandb): Wandb doesn't work well with pytest so we have to mock it out here. """ - tutils.reset_seed() - class Experiment: id = 'the_id' diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 3410cdc1d5051..5b9a08c6bd420 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -29,7 +29,8 @@ from pytorch_lightning import Trainer # noqa: E402 from pytorch_lightning.callbacks import ModelCheckpoint # noqa: E402 -import tests.base.utils as tutils # noqa: E402 +from tests.base import EvalModelTemplate # noqa: E402 +from tests.base.utils import set_random_master_port, get_default_hparams, run_model_test # noqa: E402 parser = argparse.ArgumentParser() @@ -39,14 +40,13 @@ def run_test_from_config(trainer_options): """Trains the default model with the given config.""" - tutils.reset_seed() - tutils.set_random_master_port() + set_random_master_port() ckpt_path = trainer_options['default_root_dir'] - trainer_options['checkpoint_callback'] = ModelCheckpoint(ckpt_path) + trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path)) - model, hparams = tutils.get_default_model() - 
tutils.run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False) + model = EvalModelTemplate(get_default_hparams()) + run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False) # Horovod should be initialized following training. If not, this will raise an exception. assert hvd.size() == 2 diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 81a2325c09897..f4f1d9c20a6e9 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -6,9 +6,7 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - LightningTestModel, -) +from tests.base import LightningTestModel, EvalModelTemplate @pytest.mark.spawn @@ -18,8 +16,6 @@ def test_amp_single_gpu(tmpdir, backend): """Make sure DP/DDP + AMP work.""" tutils.reset_seed() - model, hparams = tutils.get_default_model() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -28,6 +24,7 @@ def test_amp_single_gpu(tmpdir, backend): precision=16 ) + model = EvalModelTemplate(tutils.get_default_hparams()) # tutils.run_model_test(trainer_options, model) result = trainer.fit(model) @@ -39,10 +36,9 @@ def test_amp_single_gpu(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_multi_gpu(tmpdir, backend): """Make sure DP/DDP + AMP work.""" - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) trainer_options = dict( default_root_dir=tmpdir, @@ -63,8 +59,6 @@ def test_amp_multi_gpu(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" - tutils.reset_seed() - # simulate setting slurm flags tutils.set_random_master_port() 
os.environ['SLURM_LOCALID'] = str(0) @@ -102,8 +96,6 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): def test_cpu_model_with_amp(tmpdir): """Make sure model trains on CPU.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -113,7 +105,7 @@ def test_cpu_model_with_amp(tmpdir): precision=16 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) with pytest.raises((MisconfigurationException, ModuleNotFoundError)): tutils.run_model_test(trainer_options, model, on_gpu=False) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index e7b422dcb22cf..46d1ba6e44aaf 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -15,13 +15,12 @@ LightTrainDataloader, LightningTestModel, LightTestMixin, + EvalModelTemplate, ) def test_early_stopping_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - stopping = EarlyStopping(monitor='val_loss', min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, @@ -33,7 +32,7 @@ def test_early_stopping_cpu_model(tmpdir): val_percent_check=0.1, ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) # test freeze on cpu @@ -49,10 +48,8 @@ def test_early_stopping_cpu_model(tmpdir): reason="Distributed training is not supported on MacOS before Torch 1.3.0") def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -64,13 +61,12 @@ def test_multi_cpu_model_ddp(tmpdir): distributed_backend='ddp_cpu' ) + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) def test_lbfgs_cpu_model(tmpdir): """Test each of the trainer 
options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, max_epochs=2, @@ -80,15 +76,16 @@ def test_lbfgs_cpu_model(tmpdir): val_percent_check=0.2, ) - model, hparams = tutils.get_default_model(lbfgs=True) - # the test is there for the closure not the performance - tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.) + hparams = tutils.get_default_hparams() + setattr(hparams, 'optimizer_name', 'lbfgs') + setattr(hparams, 'learning_rate', 0.002) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__lbfgs + tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.5) def test_default_logger_callbacks_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -99,7 +96,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir): val_percent_check=0.01, ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test_without_loggers(trainer_options, model) # test freeze on cpu @@ -109,8 +106,6 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -143,8 +138,6 @@ def test_running_test_after_fitting(tmpdir): def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - tutils.reset_seed() - class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): pass @@ -180,8 +173,6 @@ class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_batch_parse(): - tutils.reset_seed() - trainer = Trainer() # batch is just a tensor @@ -229,8 +220,6 @@ def 
test_single_gpu_batch_parse(): def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -249,8 +238,6 @@ def test_simple_cpu(tmpdir): def test_cpu_model(tmpdir): """Make sure model trains on CPU.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -259,15 +246,13 @@ def test_cpu_model(tmpdir): val_percent_check=0.4 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) def test_all_features_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, gradient_clip_val=1.0, @@ -280,14 +265,12 @@ def test_all_features_cpu_model(tmpdir): val_percent_check=0.4 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) def test_tbptt_cpu_model(tmpdir): """Test truncated back propagation through time works.""" - tutils.reset_seed() - truncated_bptt_steps = 2 sequence_size = 30 batch_size = 30 @@ -358,10 +341,6 @@ def train_dataloader(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_model(tmpdir): """Make sure single GPU works (DP mode).""" - tutils.reset_seed() - - model, hparams = tutils.get_default_model() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -371,8 +350,5 @@ def test_single_gpu_model(tmpdir): gpus=1 ) + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model) - - -# if __name__ == '__main__': -# pytest.main([__file__]) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index dcd90b08ce911..dbaf4db8f8ed2 100644 --- a/tests/models/test_gpu.py +++ 
b/tests/models/test_gpu.py @@ -9,7 +9,7 @@ from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate PRETEND_N_OF_GPUS = 16 @@ -19,11 +19,8 @@ @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model(tmpdir, backend): """Make sure DDP works.""" - - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -33,6 +30,7 @@ def test_multi_gpu_model(tmpdir, backend): distributed_backend=backend, ) + model = EvalModelTemplate(tutils.get_default_hparams()) # tutils.run_model_test(trainer_options, model) trainer = Trainer(**trainer_options) result = trainer.fit(model) @@ -45,31 +43,27 @@ def test_multi_gpu_model(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_ddp_all_dataloaders_passed_to_fit(tmpdir): """Make sure DDP works with dataloaders passed to fit()""" - - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() + trainer_options = dict(default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + train_percent_check=0.4, + val_percent_check=0.2, + gpus=[0, 1], + distributed_backend='ddp') - trainer = Trainer( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, - gpus=[0, 1], - distributed_backend='ddp' - ) - result = trainer.fit(model, - train_dataloader=model.train_dataloader(), - val_dataloaders=model.val_dataloader()) + model = EvalModelTemplate(tutils.get_default_hparams()) + fit_options = dict(train_dataloader=model.train_dataloader(), + 
val_dataloaders=model.val_dataloader()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model, **fit_options) assert result == 1, "DDP doesn't work with dataloaders passed to fit()." def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -139,9 +133,6 @@ def assert_pred_same(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_none_backend(tmpdir): """Make sure when using multiple GPUs the user can't use `distributed_backend = None`.""" - tutils.reset_seed() - - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -151,6 +142,7 @@ def test_multi_gpu_none_backend(tmpdir): gpus='-1' ) + model = EvalModelTemplate(tutils.get_default_hparams()) with pytest.warns(UserWarning): tutils.run_model_test(trainer_options, model) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 21a90c191579b..0f41dee6e4fb0 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -40,8 +40,12 @@ def _nccl_available(): def _run_horovod(trainer_options, on_gpu=False): """Execute the training script across multiple workers in parallel.""" - cmdline = ['horovodrun', '-np', '2', sys.executable, TEST_SCRIPT, - '--trainer-options', shlex.quote(json.dumps(trainer_options))] + cmdline = [ + 'horovodrun', + '-np', '2', + sys.executable, TEST_SCRIPT, + '--trainer-options', shlex.quote(json.dumps(trainer_options)) + ] if on_gpu: cmdline += ['--on-gpu'] exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 0921a3a871f10..af0165d498ab0 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -21,8 +21,6 @@ @pytest.mark.skipif(torch.cuda.device_count() < 
2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib(tmpdir, backend): """Verify `test()` on pretrained model.""" - - tutils.reset_seed() tutils.set_random_master_port() hparams = tutils.get_default_hparams() @@ -74,8 +72,6 @@ def test_running_test_pretrained_model_distrib(tmpdir, backend): def test_running_test_pretrained_model_cpu(tmpdir): """Verify test() on pretrained model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -113,8 +109,6 @@ def test_running_test_pretrained_model_cpu(tmpdir): def test_load_model_from_checkpoint(tmpdir): """Verify test() on pretrained model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -157,9 +151,6 @@ def test_load_model_from_checkpoint(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" - - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -232,8 +223,6 @@ def assert_good_acc(): def test_model_saving_loading(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) diff --git a/tests/trainer/test_checks.py b/tests/trainer/test_checks.py index d69ec1e6e243a..7a68b2ab13bca 100755 --- a/tests/trainer/test_checks.py +++ b/tests/trainer/test_checks.py @@ -15,7 +15,6 @@ def test_error_on_no_train_step(tmpdir): """ Test that an error is thrown when no `training_step()` is defined """ - tutils.reset_seed() class CurrentTestModel(LightningModule): def forward(self, x): @@ -30,7 +29,6 @@ def forward(self, x): def test_error_on_no_train_dataloader(tmpdir): """ Test that an error is thrown when no `training_dataloader()` is defined """ - tutils.reset_seed() hparams = 
tutils.get_default_hparams() class CurrentTestModel(TestModelBase): @@ -45,7 +43,6 @@ class CurrentTestModel(TestModelBase): def test_error_on_no_configure_optimizers(tmpdir): """ Test that an error is thrown when no `configure_optimizers()` is defined """ - tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, LightningModule): def forward(self, x): @@ -68,7 +65,6 @@ def test_warning_on_wrong_validation_settings(tmpdir): throw warning if `val_epoch_end()` is not defined * error if `validation_step()` is overriden but `val_dataloader()` is not """ - tutils.reset_seed() hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) @@ -111,7 +107,6 @@ def test_warning_on_wrong_test_settigs(tmpdir): throw warning if `test_epoch_end()` is not defined * error if `test_step()` is overriden but `test_dataloader()` is not """ - tutils.reset_seed() hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 83ff481d694f9..0d528474d38c2 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -55,7 +55,6 @@ class CurrentTestModel( def test_multiple_val_dataloader(tmpdir): """Verify multiple val_dataloader.""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -90,7 +89,6 @@ class CurrentTestModel( def test_multiple_test_dataloader(tmpdir): """Verify multiple test_dataloader.""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -127,7 +125,6 @@ class CurrentTestModel( def test_train_dataloaders_passed_to_fit(tmpdir): """Verify that train dataloader can be passed to fit """ - tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, TestModelBase): pass @@ -149,7 +146,6 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): def test_train_val_dataloaders_passed_to_fit(tmpdir): """ Verify that train & val dataloader can be 
passed to fit """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -178,7 +174,6 @@ class CurrentTestModel( def test_all_dataloaders_passed_to_fit(tmpdir): """Verify train, val & test dataloader can be passed to fit """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -215,7 +210,6 @@ class CurrentTestModel( def test_multiple_dataloaders_passed_to_fit(tmpdir): """Verify that multiple val & test dataloaders can be passed to fit.""" - tutils.reset_seed() class CurrentTestModel( LightningTestModel, @@ -252,7 +246,6 @@ class CurrentTestModel( def test_mixing_of_dataloader_options(tmpdir): """Verify that dataloaders can be passed to fit""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -294,7 +287,6 @@ class CurrentTestModel( def test_inf_train_dataloader(tmpdir): """Test inf train data loader (e.g. IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfTrainDataloader, @@ -336,7 +328,6 @@ class CurrentTestModel( def test_inf_val_dataloader(tmpdir): """Test inf val data loader (e.g. IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfValDataloader, @@ -369,7 +360,6 @@ class CurrentTestModel( def test_inf_test_dataloader(tmpdir): """Test inf test data loader (e.g. 
IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfTestDataloader, @@ -404,7 +394,6 @@ class CurrentTestModel( def test_error_on_zero_len_dataloader(tmpdir): """ Test that error is raised if a zero-length dataloader is defined """ - tutils.reset_seed() class CurrentTestModel( LightZeroLenDataloader, @@ -428,7 +417,6 @@ class CurrentTestModel( @pytest.mark.skipif(platform.system() == 'Windows', reason='Does not apply to Windows platform.') def test_warning_with_few_workers(tmpdir): """ Test that error is raised if dataloader with only a few workers is used """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index 96fe461e99a52..ba6e9c336b130 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -12,8 +12,7 @@ def test_error_on_more_than_1_optimizer(tmpdir): - ''' Check that error is thrown when more than 1 optimizer is passed ''' - tutils.reset_seed() + """ Check that error is thrown when more than 1 optimizer is passed """ class CurrentTestModel( LightTestMultipleOptimizersWithSchedulingMixin, @@ -36,8 +35,7 @@ class CurrentTestModel( def test_model_reset_correctly(tmpdir): - ''' Check that model weights are correctly reset after lr_find() ''' - tutils.reset_seed() + """ Check that model weights are correctly reset after lr_find() """ class CurrentTestModel( LightTrainDataloader, @@ -66,8 +64,7 @@ class CurrentTestModel( def test_trainer_reset_correctly(tmpdir): - ''' Check that all trainer parameters are reset correctly after lr_find() ''' - tutils.reset_seed() + """ Check that all trainer parameters are reset correctly after lr_find() """ class CurrentTestModel( LightTrainDataloader, @@ -104,7 +101,6 @@ class CurrentTestModel( def test_trainer_arg_bool(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -129,7 +125,6 @@ class CurrentTestModel( def test_trainer_arg_str(tmpdir): - 
tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -155,7 +150,6 @@ class CurrentTestModel( def test_call_to_trainer_method(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index b445dcb2f7173..be0ac5471d24c 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -12,13 +12,12 @@ LightTestMultipleOptimizersWithSchedulingMixin, LightTestOptimizersWithMixedSchedulingMixin, LightTestReduceLROnPlateauMixin, - LightTestNoneOptimizerMixin + LightTestNoneOptimizerMixin, EvalModelTemplate ) def test_optimizer_with_scheduling(tmpdir): """ Verify that learning rate scheduling is working """ - tutils.reset_seed() class CurrentTestModel( LightTestOptimizerWithSchedulingMixin, @@ -54,7 +53,6 @@ class CurrentTestModel( def test_multi_optimizer_with_scheduling(tmpdir): """ Verify that learning rate scheduling is working """ - tutils.reset_seed() class CurrentTestModel( LightTestMultipleOptimizersWithSchedulingMixin, @@ -94,7 +92,6 @@ class CurrentTestModel( def test_multi_optimizer_with_scheduling_stepping(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTestOptimizersWithMixedSchedulingMixin, @@ -138,7 +135,6 @@ class CurrentTestModel( def test_reduce_lr_on_plateau_scheduling(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTestReduceLROnPlateauMixin, @@ -168,10 +164,9 @@ class CurrentTestModel( def test_optimizer_return_options(): - tutils.reset_seed() trainer = Trainer() - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) # single optimizer opt_a = torch.optim.Adam(model.parameters(), lr=0.002) @@ -226,11 +221,10 @@ def test_optimizer_return_options(): def test_none_optimizer_warning(): - tutils.reset_seed() trainer = Trainer() - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) 
model.configure_optimizers = lambda: None with pytest.warns(UserWarning, match='will run with no optimizer'): @@ -238,7 +232,6 @@ def test_none_optimizer_warning(): def test_none_optimizer(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTestNoneOptimizerMixin, diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index b7344a70b3e40..d72c04ed80e36 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1,6 +1,7 @@ import glob import math import os +import types from argparse import Namespace import pytest @@ -22,7 +23,7 @@ LightValidationMultipleDataloadersMixin, LightTrainDataloader, LightTestDataloader, - LightValidationMixin, + LightValidationMixin, EvalModelTemplate, ) @@ -53,7 +54,6 @@ def test_hparams_save_load(tmpdir): def test_no_val_module(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - tutils.reset_seed() hparams = tutils.get_default_hparams() @@ -92,7 +92,6 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): def test_no_val_end_module(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, LightValidationStepMixin, TestModelBase): pass @@ -132,7 +131,6 @@ def test_gradient_accumulation_scheduling(tmpdir): """ Test grad accumulation by the freq of optimizer updates """ - tutils.reset_seed() # test incorrect configs with pytest.raises(IndexError): @@ -205,7 +203,6 @@ def _optimizer_step(self, epoch, batch_idx, optimizer, def test_loading_meta_tags(tmpdir): - tutils.reset_seed() hparams = tutils.get_default_hparams() @@ -225,7 +222,6 @@ def test_loading_meta_tags(tmpdir): def test_dp_output_reduce(): mixin = TrainerLoggingMixin() - tutils.reset_seed() # test identity when we have a single gpu out = torch.rand(3, 1) @@ -291,7 +287,6 @@ def mock_save_function(filepath): def test_model_freeze_unfreeze(): - 
tutils.reset_seed() hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -300,11 +295,8 @@ def test_model_freeze_unfreeze(): model.unfreeze() -def test_resume_from_checkpoint(tmpdir): - """Verify resuming from checkpoint (epoch, batch numbers and on_load_checkpoint())""" - import types - - tutils.reset_seed() +def test_resume_from_checkpoint_epoch_restored(tmpdir): + """Verify resuming from checkpoint runs the right number of epochs""" hparams = tutils.get_default_hparams() @@ -371,8 +363,7 @@ def increment_on_load_checkpoint(self, _): def _init_steps_model(): """private method for initializing a model with 5% train epochs""" - tutils.reset_seed() - model, _ = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) # define train epoch to 5% of data train_percent = 0.5 @@ -460,7 +451,6 @@ def test_trainer_min_steps_and_epochs(tmpdir): def test_benchmark_option(tmpdir): """Verify benchmark option.""" - tutils.reset_seed() class CurrentTestModel( LightValidationMultipleDataloadersMixin, @@ -523,7 +513,6 @@ def test_epoch_end(self, outputs): def test_disabled_validation(): """Verify that `val_percent_check=0` disables the validation loop unless `fast_dev_run=True`.""" - tutils.reset_seed() class CurrentModel(LightTrainDataloader, LightValidationMixin, TestModelBase): @@ -666,7 +655,6 @@ def test_gradient_clipping(tmpdir): """ Test gradient clipping """ - tutils.reset_seed() hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py index b2d1da957bc17..c4c23d0fff4ed 100644 --- a/tests/trainer/test_trainer_cli.py +++ b/tests/trainer/test_trainer_cli.py @@ -13,7 +13,6 @@ return_value=Namespace(**Trainer.default_attributes())) def test_default_args(tmpdir): """Tests default argument parser for Trainer""" - tutils.reset_seed() # logger file to get meta logger = tutils.get_default_logger(tmpdir) From 
cf0d5dc470c581580dc017124f40caef845be0e8 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Sat, 2 May 2020 14:40:31 +0200 Subject: [PATCH 13/43] Docker release (#1613) * Update docker_builds.yml * Update docker_builds.yml * nightly Co-authored-by: Jirka Borovec --- .github/workflows/docker_builds.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker_builds.yml b/.github/workflows/docker_builds.yml index 1ac289ba61f7d..736ff72460d74 100644 --- a/.github/workflows/docker_builds.yml +++ b/.github/workflows/docker_builds.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v2 - name: Publish Releases to Docker # only on releases - uses: elgohr/Publish-Docker-Github-Action@master + uses: elgohr/Publish-Docker-Github-Action@2.14 if: contains(github.ref, 'refs/tags/') && !contains(${{ steps.get_version.outputs.VERSION }}, 'rc') %% !contains(${{ steps.get_version.outputs.VERSION }}, 'dev') with: name: pytorchlightning/pytorch_lightning @@ -30,10 +30,10 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: docker/Dockerfile buildargs: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.VERSION }} - tags: "${{ steps.get_version.outputs.VERSION }}_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }},stable_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }}" + tags: "${{ steps.get_version.outputs.VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},stable-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - name: Publish Master # publish master - uses: elgohr/Publish-Docker-Github-Action@master + uses: elgohr/Publish-Docker-Github-Action@2.14 if: github.event_name == 'push' with: name: pytorchlightning/pytorch_lightning @@ -41,4 +41,4 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: docker/Dockerfile 
buildargs: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.VERSION }} - tags: "latest_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }}" + tags: "nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" From 210cd657dd0f83069b8c6abc7402508f354668b3 Mon Sep 17 00:00:00 2001 From: Dmitry Lipin Date: Sat, 2 May 2020 15:41:37 +0300 Subject: [PATCH 14/43] fix LightningTemplateModel (#1577) * fix LightningTemplateModel * update CHANGELOG.md * update LightningTemplate * update changelog * update changelog * loss fix --- CHANGELOG.md | 1 + pl_examples/models/lightning_template.py | 168 ++++------------------- 2 files changed, 30 insertions(+), 139 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94675a20111c0..aa39cfcc615c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,6 +69,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Defines shared proc. rank, remove rank from instances (e.g. 
loggers) ([#1408](https://github.com/PyTorchLightning/pytorch-lightning/pull/1408)) - Updated semantic segmentation example with custom U-Net and logging ([#1371](https://github.com/PyTorchLightning/pytorch-lightning/pull/1371)) - Disabled val and test shuffling ([#1600](https://github.com/PyTorchLightning/pytorch-lightning/pull/1600)) +- Updated LightningTemplateModel to look more like Colab example ([#1546](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) ### Deprecated diff --git a/pl_examples/models/lightning_template.py b/pl_examples/models/lightning_template.py index c5a7f9396f287..13b3bc67a912b 100644 --- a/pl_examples/models/lightning_template.py +++ b/pl_examples/models/lightning_template.py @@ -46,22 +46,6 @@ def __init__(self, hparams): # init superclass super().__init__() self.hparams = hparams - - self.batch_size = hparams.batch_size - - # if you specify an example input, the summary will show input/output for each layer - self.example_input_array = torch.rand(5, 28 * 28) - - # build model - self.__build_model() - - # --------------------- - # MODEL SETUP - # --------------------- - def __build_model(self): - """ - Layout the model. - """ self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) @@ -70,27 +54,17 @@ def __build_model(self): self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) - # --------------------- - # TRAINING - # --------------------- def forward(self, x): """ No special modification required for Lightning, define it as you normally would in the `nn.Module` in vanilla PyTorch. 
""" - x = self.c_d1(x) + x = self.c_d1(x.view(x.size(0), -1)) x = torch.tanh(x) x = self.c_d1_bn(x) x = self.c_d1_drop(x) - x = self.c_d2(x) - logits = F.log_softmax(x, dim=1) - - return logits - - def loss(self, labels, logits): - nll = F.nll_loss(logits, labels) - return nll + return x def training_step(self, batch, batch_idx): """ @@ -99,22 +73,10 @@ def training_step(self, batch, batch_idx): """ # forward pass x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - - tqdm_dict = {'train_loss': loss_val} - output = OrderedDict({ - 'loss': loss_val, - 'progress_bar': tqdm_dict, - 'log': tqdm_dict - }) - - # can also return just a scalar instead of a dict (return loss_val) - return output + loss = F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_idx): """ @@ -122,58 +84,35 @@ def validation_step(self, batch, batch_idx): passed in as `batch`. 
""" x, y = batch - x = x.view(x.size(0), -1) y_hat = self(x) - - loss_val = self.loss(y, y_hat) - - # acc + val_loss = F.cross_entropy(y_hat, y) labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - val_acc = torch.tensor(val_acc) - - if self.on_gpu: - val_acc = val_acc.cuda(loss_val.device.index) - - output = OrderedDict({ - 'val_loss': loss_val, - 'val_acc': val_acc, - }) + n_correct_pred = torch.sum(y == labels_hat).item() + return {'val_loss': val_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} - # can also return just a scalar instead of a dict (return loss_val) - return output + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + test_loss = F.cross_entropy(y_hat, y) + labels_hat = torch.argmax(y_hat, dim=1) + n_correct_pred = torch.sum(y == labels_hat).item() + return {'test_loss': test_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} def validation_epoch_end(self, outputs): """ Called at the end of validation to aggregate outputs. :param outputs: list of individual outputs of each validation step. 
""" - # if returned a scalar from validation_step, outputs is a list of tensor scalars - # we return just the average in this case (if we want) - # return torch.stack(outputs).mean() - - val_loss_mean = 0 - val_acc_mean = 0 - for output in outputs: - val_loss = output['val_loss'] - - # reduce manually when using dp - if self.trainer.use_dp or self.trainer.use_ddp2: - val_loss = torch.mean(val_loss) - val_loss_mean += val_loss + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + val_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) + tensorboard_logs = {'val_loss': avg_loss, 'val_acc': val_acc} + return {'val_loss': avg_loss, 'log': tensorboard_logs} - # reduce manually when using dp - val_acc = output['val_acc'] - if self.trainer.use_dp or self.trainer.use_ddp2: - val_acc = torch.mean(val_acc) - - val_acc_mean += val_acc - - val_loss_mean /= len(outputs) - val_acc_mean /= len(outputs) - tqdm_dict = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean} - result = {'progress_bar': tqdm_dict, 'log': tqdm_dict, 'val_loss': val_loss_mean} - return result + def test_epoch_end(self, outputs): + avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() + test_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) + tensorboard_logs = {'test_loss': avg_loss, 'test_acc': test_acc} + return {'test_loss': avg_loss, 'log': tensorboard_logs} # --------------------- # TRAINING SETUP @@ -187,72 +126,23 @@ def configure_optimizers(self): scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) return [optimizer], [scheduler] - def __dataloader(self, train): - # this is neede when you want some info about dataset before binding to trainer - self.prepare_data() - # init data generators - transform = transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, - transform=transform, download=False) - - # 
when using multi-node (ddp) we need to add the datasampler - batch_size = self.hparams.batch_size - - loader = DataLoader( - dataset=dataset, - batch_size=batch_size, - num_workers=0 - ) - - return loader - def prepare_data(self): transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - _ = MNIST(root=self.hparams.data_root, train=True, - transform=transform, download=True) + self.mnist_train = MNIST(self.hparams.data_root, train=True, download=True, transform=transform) + self.mnist_test = MNIST(self.hparams.data_root, train=False, download=True, transform=transform) def train_dataloader(self): log.info('Training data loader called.') - return self.__dataloader(train=True) + return DataLoader(self.mnist_train, batch_size=self.hparams.batch_size, num_workers=4) def val_dataloader(self): log.info('Validation data loader called.') - return self.__dataloader(train=False) + return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=4) def test_dataloader(self): log.info('Test data loader called.') - return self.__dataloader(train=False) - - def test_step(self, batch, batch_idx): - """ - Lightning calls this during testing, similar to `validation_step`, - with the data from the test dataloader passed in as `batch`. - """ - output = self.validation_step(batch, batch_idx) - # Rename output keys - output['test_loss'] = output.pop('val_loss') - output['test_acc'] = output.pop('val_acc') - - return output - - def test_epoch_end(self, outputs): - """ - Called at the end of test to aggregate outputs, similar to `validation_epoch_end`. 
- :param outputs: list of individual outputs of each test step - """ - results = self.validation_step_end(outputs) - - # rename some keys - results['progress_bar'].update({ - 'test_loss': results['progress_bar'].pop('val_loss'), - 'test_acc': results['progress_bar'].pop('val_acc'), - }) - results['log'] = results['progress_bar'] - results['test_loss'] = results.pop('val_loss') - - return results + return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=4) @staticmethod def add_model_specific_args(parent_parser, root_dir): # pragma: no-cover From 4dc77b5a1a6b4b72c1498375d71044589c767741 Mon Sep 17 00:00:00 2001 From: Jacob Zhong Date: Sat, 2 May 2020 08:44:35 -0400 Subject: [PATCH 15/43] Change lightning module params to dict when loading (#1639) * change module params to dict * tiny change * reverse --- pytorch_lightning/core/lightning.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index fc88fb8c78687..2f1de6412f0f0 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1550,8 +1550,9 @@ def _load_model_state(cls, checkpoint: Dict[str, Any], *args, **kwargs) -> 'Ligh ) # load the state_dict on the model automatically - model_args = [hparams] if hparams else [] - model = cls(*model_args, *args, **kwargs) + if hparams: + kwargs.update(hparams=hparams) + model = cls(*args, **kwargs) model.load_state_dict(checkpoint['state_dict']) # give model a chance to load something From 152a2eb30ce82deefdb738b81fda66a9c218ed76 Mon Sep 17 00:00:00 2001 From: Oliver Neumann Date: Sat, 2 May 2020 14:50:47 +0200 Subject: [PATCH 16/43] wandb logger 'global_step' affects other logger (#1492) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Removed unnecessary 'global_step' from wandb logger. * Fixed wrong step implementation in wandb and missing metric skipping in logger base. 
* simplified metric check in base logger * Added Fix Description in CHANGELOG.md * Updated wandb logger tests. * update test, step=3 * Moved Fix Description in CHANGELOG.md to unreleased. * Update CHANGELOG.md Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec --- CHANGELOG.md | 1 + pytorch_lightning/loggers/base.py | 2 +- pytorch_lightning/loggers/wandb.py | 4 +--- tests/loggers/test_wandb.py | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa39cfcc615c1..f67ba54900cdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/PyTorchLightning/pytorch-lightning/pull/1669)) +- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1485)) ## [0.7.5] - 2020-04-27 diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 39891c447bba2..857d661fdb5b3 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -125,7 +125,7 @@ def agg_and_log_metrics(self, metrics: Dict[str, float], step: Optional[int] = N """ agg_step, metrics_to_log = self._aggregate_metrics(metrics=metrics, step=step) - if metrics_to_log is not None: + if metrics_to_log: self.log_metrics(metrics=metrics_to_log, step=agg_step) @abstractmethod diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index c348644141fca..0d5ff9855a40d 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -119,9 +119,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - if step is not None: - metrics['global_step'] = step - self.experiment.log(metrics) +
self.experiment.log(metrics, step=step) @property def name(self) -> str: diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 82555f88e83cd..3a63fcb9da57e 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -14,11 +14,11 @@ def test_wandb_logger(wandb): logger = WandbLogger(anonymous=True, offline=True) logger.log_metrics({'acc': 1.0}) - wandb.init().log.assert_called_once_with({'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) wandb.init().log.reset_mock() logger.log_metrics({'acc': 1.0}, step=3) - wandb.init().log.assert_called_once_with({'global_step': 3, 'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=3) logger.log_hyperparams({'test': None}) wandb.init().config.update.assert_called_once_with({'test': None}, allow_val_change=True) From d06d5e68b6c2e2fbd254fd01e701f2ff0469eea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 2 May 2020 15:08:21 +0200 Subject: [PATCH 17/43] Fix typo in progress bar docs (#1680) * fix typo * Typo * typo Borda Co-authored-by: Jirka Borovec --- pytorch_lightning/callbacks/progress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index a397c0d2c8d56..a770c6c9d95e7 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -24,10 +24,10 @@ class LitProgressBar(ProgressBarBase): def __init__(self): super().__init__() # don't forget this :) - self.enabled = True + self.enable = True def disable(self): - self.enableenabled = False + self.enable = False def on_batch_end(self, trainer, pl_module): super().on_batch_end(trainer, pl_module) # don't forget this :) From fafe5d63a70a422fb8c8892c6f0a10c2c6f23816 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste SCHIRATTI Date: Sat, 2 May 2020 15:08:46 +0200 Subject: [PATCH 18/43] Transfer learning example (#1564) * Fine 
tuning example. * Fix (in train method) + Borda's comments (added argparse + fixed docstrings). * Updated CHANGELOG.md * Fix + updated docstring. * Fixes (awaelchli's comments) + docstrings. * Fix train/val loss. * Fix. --- CHANGELOG.md | 2 + .../computer_vision_fine_tuning.py | 440 ++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 pl_examples/domain_templates/computer_vision_fine_tuning.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f67ba54900cdc..ef025c6a856a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added callback for logging learning rates ([#1498](https://github.com/PyTorchLightning/pytorch-lightning/pull/1498)) +- Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) + ### Changed ### Deprecated diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py new file mode 100644 index 0000000000000..42a0a936d9e34 --- /dev/null +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -0,0 +1,440 @@ +"""Computer vision example on Transfer Learning. + +This computer vision example illustrates how one could fine-tune a pre-trained +network (by default, a ResNet50 is used) using pytorch-lightning. For the sake +of this example, the 'cats and dogs dataset' (~60MB, see `DATA_URL` below) and +the proposed network (denoted by `TransferLearningModel`, see below) is +trained for 15 epochs. The training consists in three stages. From epoch 0 to +4, the feature extractor (the pre-trained network) is frozen except maybe for +the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm +layers (if `train_bn = True`) and the parameters of the classifier are trained +as a single parameters group with lr = 1e-2. 
From epoch 5 to 9, the last two +layer groups of the pre-trained network are unfrozen and added to the +optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 for the +first parameter group in the optimizer). Eventually, from epoch 10, all the +remaining layer groups of the pre-trained network are unfrozen and added to +the optimizer as a third parameter group. From epoch 10, the parameters of the +pre-trained network are trained with lr = 1e-5 while those of the classifier +are trained with lr = 1e-4. + +Note: + See: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html +""" + +import argparse +from collections import OrderedDict +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Generator, Union + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from pytorch_lightning import _logger as log +from torch import optim +from torch.optim.lr_scheduler import MultiStepLR +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader +from torchvision import models +from torchvision import transforms +from torchvision.datasets import ImageFolder +from torchvision.datasets.utils import download_and_extract_archive + +BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) +DATA_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip' + + +# --- Utility functions --- + + +def _make_trainable(module: torch.nn.Module) -> None: + """Unfreezes a given module. + + Args: + module: The module to unfreeze + """ + for param in module.parameters(): + param.requires_grad = True + module.train() + + +def _recursive_freeze(module: torch.nn.Module, + train_bn: bool = True) -> None: + """Freezes the layers of a given module. 
+ + Args: + module: The module to freeze + train_bn: If True, leave the BatchNorm layers in training mode + """ + children = list(module.children()) + if not children: + if not (isinstance(module, BN_TYPES) and train_bn): + for param in module.parameters(): + param.requires_grad = False + module.eval() + else: + # Make the BN layers trainable + _make_trainable(module) + else: + for child in children: + _recursive_freeze(module=child, train_bn=train_bn) + + +def freeze(module: torch.nn.Module, + n: Optional[int] = None, + train_bn: bool = True) -> None: + """Freezes the layers up to index n (if n is not None). + + Args: + module: The module to freeze (at least partially) + n: Max depth at which we stop freezing the layers. If None, all + the layers of the given module will be frozen. + train_bn: If True, leave the BatchNorm layers in training mode + """ + children = list(module.children()) + n_max = len(children) if n is None else int(n) + + for child in children[:n_max]: + _recursive_freeze(module=child, train_bn=train_bn) + + for child in children[n_max:]: + _make_trainable(module=child) + + +def filter_params(module: torch.nn.Module, + train_bn: bool = True) -> Generator: + """Yields the trainable parameters of a given module. 
+ + Args: + module: A given module + train_bn: If True, leave the BatchNorm layers in training mode + + Returns: + Generator + """ + children = list(module.children()) + if not children: + if not (isinstance(module, BN_TYPES) and train_bn): + for param in module.parameters(): + if param.requires_grad: + yield param + else: + for child in children: + for param in filter_params(module=child, train_bn=train_bn): + yield param + + +def _unfreeze_and_add_param_group(module: torch.nn.Module, + optimizer: Optimizer, + lr: Optional[float] = None, + train_bn: bool = True): + """Unfreezes a module and adds its parameters to an optimizer.""" + _make_trainable(module) + params_lr = optimizer.param_groups[0]['lr'] if lr is None else float(lr) + optimizer.add_param_group( + {'params': filter_params(module=module, train_bn=train_bn), + 'lr': params_lr / 10., + }) + + +# --- Pytorch-lightning module --- + + +class TransferLearningModel(pl.LightningModule): + """Transfer Learning with pre-trained ResNet50. + + Args: + hparams: Model hyperparameters + dl_path: Path where the data will be downloaded + """ + def __init__(self, + hparams: argparse.Namespace, + dl_path: Union[str, Path]) -> None: + super().__init__() + self.hparams = hparams + self.dl_path = dl_path + self.__build_model() + + def __build_model(self): + """Define model layers & loss.""" + + # 1. Load pre-trained network: + model_func = getattr(models, self.hparams.backbone) + backbone = model_func(pretrained=True) + + _layers = list(backbone.children())[:-1] + self.feature_extractor = torch.nn.Sequential(*_layers) + freeze(module=self.feature_extractor, train_bn=self.hparams.train_bn) + + # 2. Classifier: + _fc_layers = [torch.nn.Linear(2048, 256), + torch.nn.Linear(256, 32), + torch.nn.Linear(32, 1)] + self.fc = torch.nn.Sequential(*_fc_layers) + + # 3. Loss: + self.loss_func = F.binary_cross_entropy_with_logits + + def forward(self, x): + """Forward pass. Returns logits.""" + + # 1. 
Feature extraction: + x = self.feature_extractor(x) + x = x.squeeze(-1).squeeze(-1) + + # 2. Classifier (returns logits): + x = self.fc(x) + + return x + + def loss(self, labels, logits): + return self.loss_func(input=logits, target=labels) + + def train(self, mode=True): + super().train(mode=mode) + + epoch = self.current_epoch + if epoch < self.hparams.milestones[0] and mode: + # feature extractor is frozen (except for BatchNorm layers) + freeze(module=self.feature_extractor, + train_bn=self.hparams.train_bn) + + elif self.hparams.milestones[0] <= epoch < self.hparams.milestones[1] and mode: + # Unfreeze last two layers of the feature extractor + freeze(module=self.feature_extractor, + n=-2, + train_bn=self.hparams.train_bn) + + def on_epoch_start(self): + """Use `on_epoch_start` to unfreeze layers progressively.""" + optimizer = self.trainer.optimizers[0] + if self.current_epoch == self.hparams.milestones[0]: + _unfreeze_and_add_param_group(module=self.feature_extractor[-2:], + optimizer=optimizer, + train_bn=self.hparams.train_bn) + + elif self.current_epoch == self.hparams.milestones[1]: + _unfreeze_and_add_param_group(module=self.feature_extractor[:-2], + optimizer=optimizer, + train_bn=self.hparams.train_bn) + + def training_step(self, batch, batch_idx): + + # 1. Forward pass: + x, y = batch + y_logits = self.forward(x) + y_true = y.view((-1, 1)).type_as(x) + y_bin = torch.ge(y_logits, 0) + + # 2. Compute loss & accuracy: + train_loss = self.loss(y_true, y_logits) + num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + + # 3. 
Outputs: + tqdm_dict = {'train_loss': train_loss} + output = OrderedDict({'loss': train_loss, + 'num_correct': num_correct, + 'log': tqdm_dict, + 'progress_bar': tqdm_dict}) + + return output + + def training_epoch_end(self, outputs): + """Compute and log training loss and accuracy at the epoch level.""" + + train_loss_mean = torch.stack([output['loss'] + for output in outputs]).mean() + train_acc_mean = torch.stack([output['num_correct'] + for output in outputs]).sum().float() + train_acc_mean /= (len(outputs) * self.hparams.batch_size) + return {'log': {'train_loss': train_loss_mean, + 'train_acc': train_acc_mean, + 'step': self.current_epoch}} + + def validation_step(self, batch, batch_idx): + + # 1. Forward pass: + x, y = batch + y_logits = self.forward(x) + y_true = y.view((-1, 1)).type_as(x) + y_bin = torch.ge(y_logits, 0) + + # 2. Compute loss & accuracy: + val_loss = self.loss(y_true, y_logits) + num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + + return {'val_loss': val_loss, + 'num_correct': num_correct} + + def validation_epoch_end(self, outputs): + """Compute and log validation loss and accuracy at the epoch level.""" + + val_loss_mean = torch.stack([output['val_loss'] + for output in outputs]).mean() + val_acc_mean = torch.stack([output['num_correct'] + for output in outputs]).sum().float() + val_acc_mean /= (len(outputs) * self.hparams.batch_size) + return {'log': {'val_loss': val_loss_mean, + 'val_acc': val_acc_mean, + 'step': self.current_epoch}} + + def configure_optimizers(self): + optimizer = optim.Adam(filter(lambda p: p.requires_grad, + self.parameters()), + lr=self.hparams.lr) + + scheduler = MultiStepLR(optimizer, + milestones=self.hparams.milestones, + gamma=self.hparams.lr_scheduler_gamma) + + return [optimizer], [scheduler] + + def prepare_data(self): + """Download images and prepare images datasets.""" + + # 1. 
Download the images + download_and_extract_archive(url=DATA_URL, + download_root=self.dl_path, + remove_finished=True) + + data_path = Path(self.dl_path).joinpath('cats_and_dogs_filtered') + + # 2. Load the data + preprocessing & data augmentation + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = ImageFolder(root=data_path.joinpath('train'), + transform=transforms.Compose([ + transforms.Resize((224, 224)), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + valid_dataset = ImageFolder(root=data_path.joinpath('validation'), + transform=transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + normalize, + ])) + + self.train_dataset = train_dataset + self.valid_dataset = valid_dataset + + def __dataloader(self, train): + """Train/validation loaders.""" + + _dataset = self.train_dataset if train else self.valid_dataset + loader = DataLoader(dataset=_dataset, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + shuffle=True if train else False) + + return loader + + def train_dataloader(self): + log.info('Training data loaded.') + return self.__dataloader(train=True) + + def val_dataloader(self): + log.info('Validation data loaded.') + return self.__dataloader(train=False) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = argparse.ArgumentParser(parents=[parent_parser]) + parser.add_argument('--backbone', + default='resnet50', + type=str, + metavar='BK', + help='Name (as in ``torchvision.models``) of the feature extractor') + parser.add_argument('--epochs', + default=15, + type=int, + metavar='N', + help='total number of epochs', + dest='nb_epochs') + parser.add_argument('--batch-size', + default=8, + type=int, + metavar='B', + help='batch size', + dest='batch_size') + parser.add_argument('--gpus', + type=int, + default=1, + help='number of gpus to use') + parser.add_argument('--lr', + 
'--learning-rate', + default=1e-2, + type=float, + metavar='LR', + help='initial learning rate', + dest='lr') + parser.add_argument('--lr-scheduler-gamma', + default=1e-1, + type=float, + metavar='LRG', + help='Factor by which the learning rate is reduced at each milestone', + dest='lr_scheduler_gamma') + parser.add_argument('--num-workers', + default=6, + type=int, + metavar='W', + help='number of CPU workers', + dest='num_workers') + parser.add_argument('--train-bn', + default=True, + type=bool, + metavar='TB', + help='Whether the BatchNorm layers should be trainable', + dest='train_bn') + parser.add_argument('--milestones', + default=[5, 10], + type=list, + metavar='M', + help='List of two epochs milestones') + return parser + + +def main(hparams: argparse.Namespace) -> None: + """Train the model. + + Args: + hparams: Model hyper-parameters + + Note: + For the sake of the example, the images dataset will be downloaded + to a temporary directory. + """ + + with TemporaryDirectory(dir=hparams.root_data_path) as tmp_dir: + + model = TransferLearningModel(hparams, dl_path=tmp_dir) + + trainer = pl.Trainer( + weights_summary=None, + show_progress_bar=True, + num_sanity_val_steps=0, + gpus=hparams.gpus, + min_epochs=hparams.nb_epochs, + max_epochs=hparams.nb_epochs) + + trainer.fit(model) + + +def get_args() -> argparse.Namespace: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument('--root-data-path', + metavar='DIR', + type=str, + default=Path.cwd().as_posix(), + help='Root directory where to download the data', + dest='root_data_path') + parser = TransferLearningModel.add_model_specific_args(parent_parser) + return parser.parse_args() + + +if __name__ == '__main__': + + main(get_args()) From e6b34ef90d5f3eac70154b305b476614a64f1981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 2 May 2020 17:01:44 +0200 Subject: [PATCH 19/43] [WIP] Reduction when batch size < num gpus (#1609) * reduce if <= num_gpus * 
add test with explanation * chlog * fix changelog Co-authored-by: J. Borovec --- CHANGELOG.md | 2 ++ pytorch_lightning/trainer/logging.py | 8 ++--- tests/trainer/test_dataloaders.py | 45 ++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef025c6a856a6..5457a6e980318 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) ### Changed + +- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) ### Deprecated diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index c1d598dc71875..978ac5df78d81 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -196,8 +196,8 @@ def reduce_distributed_output(self, output, num_gpus): elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0: pass - # reduce only metrics that have the same number of gpus - elif output[k].size(0) == num_gpus: - reduced = torch.mean(output[k]) - output[k] = reduced + # do not reduce metrics that have batch size > num gpus + elif output[k].size(0) <= num_gpus: + output[k] = torch.mean(output[k]) + return output diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 0d528474d38c2..b6f6262ee90e1 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -2,6 +2,8 @@ import pytest import torch +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Subset import tests.base.utils as tutils from pytorch_lightning import Trainer @@ -482,3 +484,46 @@ class CustomDummyObj: assert isinstance(result, torch.utils.data.DataLoader) assert isinstance(result, 
CustomDataLoader) assert hasattr(result, 'dummy_kwarg') + + +@pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs') +def test_batch_size_smaller_than_num_gpus(): + # we need at least 3 gpus for this test + num_gpus = 3 + batch_size = 3 + + class CurrentTestModel( + LightTrainDataloader, + TestModelBase, + ): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.c_d1_bn = torch.nn.ReLU() + + def train_dataloader(self): + dataloader = super().train_dataloader() + # construct a dataset with a size that is not divisible by num_gpus + # therefore the last batch will have a size < num_gpus + size = num_gpus * batch_size + (num_gpus - 1) + dataset = Subset(dataloader.dataset, range(size)) + dataloader = DataLoader( + dataset, + batch_size=self.hparams.batch_size, + drop_last=False, + ) + return dataloader + + hparams = tutils.get_default_hparams() + hparams.batch_size = batch_size + model = CurrentTestModel(hparams) + + trainer = Trainer( + max_epochs=1, + gpus=num_gpus, + ) + + # we expect the reduction for the metrics also to happen on the last batch + # where we will get fewer metrics than gpus + result = trainer.fit(model) + assert 1 == result From 595ec65796fa6cbb6ed9479e0871be3d6b208241 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 07:25:48 +0200 Subject: [PATCH 20/43] refactor trainer checks (#1651) * refactor trainer checks * opt * none * Apply suggestions from code review * imports * fix tensors --- tests/base/eval_model_valid_epoch_ends.py | 9 +-- tests/trainer/test_checks.py | 83 +++++++---------------- 2 files changed, 31 insertions(+), 61 deletions(-) diff --git a/tests/base/eval_model_valid_epoch_ends.py b/tests/base/eval_model_valid_epoch_ends.py index 89bdd5198ef61..ab14ed10ef5ab 100644 --- a/tests/base/eval_model_valid_epoch_ends.py +++ b/tests/base/eval_model_valid_epoch_ends.py @@ -17,8 +17,8 @@ def validation_epoch_end(self, outputs): # if returned a scalar from 
validation_step, outputs is a list of tensor scalars # we return just the average in this case (if we want) # return torch.stack(outputs).mean() - val_loss_mean = 0 - val_acc_mean = 0 + val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() + val_acc_mean = torch.stack([x['val_acc'] for x in outputs]).mean() for output in outputs: val_loss = self.get_output_metric(output, 'val_loss') @@ -34,8 +34,9 @@ def validation_epoch_end(self, outputs): val_acc_mean += val_acc - val_loss_mean /= len(outputs) - val_acc_mean /= len(outputs) + if outputs: # skip zero divisions + val_loss_mean /= len(outputs) + val_acc_mean /= len(outputs) metrics_dict = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()} results = {'progress_bar': metrics_dict, 'log': metrics_dict} diff --git a/tests/trainer/test_checks.py b/tests/trainer/test_checks.py index 7a68b2ab13bca..45155d67e65d7 100755 --- a/tests/trainer/test_checks.py +++ b/tests/trainer/test_checks.py @@ -1,106 +1,75 @@ import pytest import tests.base.utils as tutils -from pytorch_lightning import Trainer, LightningModule +from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from tests.base import ( - TestModelBase, - LightValidationDataloader, - LightValidationStepMixin, - LightValStepFitSingleDataloaderMixin, - LightTrainDataloader, -) +# TODO: add matching messages -def test_error_on_no_train_step(tmpdir): - """ Test that an error is thrown when no `training_step()` is defined """ - - class CurrentTestModel(LightningModule): - def forward(self, x): - pass +def test_wrong_train_setting(tmpdir): + """ + * Test that an error is thrown when no `training_dataloader()` is defined + * Test that an error is thrown when no `training_step()` is defined + """ + tutils.reset_seed() + hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) with 
pytest.raises(MisconfigurationException): - model = CurrentTestModel() + model = EvalModelTemplate(hparams) + model.train_dataloader = None trainer.fit(model) - -def test_error_on_no_train_dataloader(tmpdir): - """ Test that an error is thrown when no `training_dataloader()` is defined """ - hparams = tutils.get_default_hparams() - - class CurrentTestModel(TestModelBase): - pass - - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) - with pytest.raises(MisconfigurationException): - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.training_step = None trainer.fit(model) -def test_error_on_no_configure_optimizers(tmpdir): +def test_wrong_configure_optimizers(tmpdir): """ Test that an error is thrown when no `configure_optimizers()` is defined """ - - class CurrentTestModel(LightTrainDataloader, LightningModule): - def forward(self, x): - pass - - def training_step(self, batch, batch_idx, optimizer_idx=None): - pass - + tutils.reset_seed() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) with pytest.raises(MisconfigurationException): - model = CurrentTestModel() + model = EvalModelTemplate(tutils.get_default_hparams()) + model.configure_optimizers = None trainer.fit(model) -def test_warning_on_wrong_validation_settings(tmpdir): +def test_wrong_validation_settings(tmpdir): """ Test the following cases related to validation configuration of model: * error if `val_dataloader()` is overriden but `validation_step()` is not * if both `val_dataloader()` and `validation_step()` is overriden, throw warning if `val_epoch_end()` is not defined * error if `validation_step()` is overriden but `val_dataloader()` is not """ + tutils.reset_seed() hparams = tutils.get_default_hparams() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) - class CurrentTestModel(LightTrainDataloader, - LightValidationDataloader, - TestModelBase): - pass - # check val_dataloader -> val_step with pytest.raises(MisconfigurationException): - model = 
CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.validation_step = None trainer.fit(model) - class CurrentTestModel(LightTrainDataloader, - LightValidationStepMixin, - TestModelBase): - pass - # check val_dataloader + val_step -> val_epoch_end with pytest.warns(RuntimeWarning): - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.validation_epoch_end = None trainer.fit(model) - class CurrentTestModel(LightTrainDataloader, - LightValStepFitSingleDataloaderMixin, - TestModelBase): - pass - # check val_step -> val_dataloader with pytest.raises(MisconfigurationException): - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.val_dataloader = None trainer.fit(model) -def test_warning_on_wrong_test_settigs(tmpdir): +def test_wrong_test_settigs(tmpdir): """ Test the following cases related to test configuration of model: * error if `test_dataloader()` is overriden but `test_step()` is not * if both `test_dataloader()` and `test_step()` is overriden, From 281a73ccf7a22cdf004755f1f7b4aead40b12d84 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 13:13:11 +0200 Subject: [PATCH 21/43] specify cache matrix (#1725) --- .github/workflows/ci-testing.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index fad7434e66a02..e9343b6b9ce71 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -71,9 +71,9 @@ jobs: uses: actions/cache@v1 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python-version }}-pip-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-extra.txt') }} + key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-extra.txt') }} restore-keys: | - ${{ runner.os }}-${{ matrix.python-version }}-pip- + ${{ runner.os }}-${{ 
matrix.python-version }}-${{ matrix.requires }}-pip- - name: Install dependencies run: | @@ -99,14 +99,14 @@ jobs: run: | # tox --sitepackages # flake8 . - coverage run --source pytorch_lightning -m py.test pytorch_lightning tests -v --doctest-modules --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml + coverage run --source pytorch_lightning -m py.test pytorch_lightning tests -v --doctest-modules --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml coverage report - name: Upload pytest test results uses: actions/upload-artifact@master with: - name: pytest-results-${{ runner.os }}-${{ matrix.python-version }} - path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml + name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} + path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml # Use always() to always run this step to publish test results when there are test failures if: always() From 0cd5e64701148585b7957cd62d6cf764b2d0185e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 13:13:52 +0200 Subject: [PATCH 22/43] Tests: refactor loggers (#1689) * refactor default model * drop redundant seeds * path * refactor loggers tests * imports --- tests/loggers/test_base.py | 6 +++--- tests/loggers/test_neptune.py | 4 ++-- tests/loggers/test_trains.py | 5 ++--- tests/loggers/test_wandb.py | 1 - 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index 1a52dadf9c621..595ca0ab09396 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection from pytorch_lightning.utilities import rank_zero_only -from tests.base import LightningTestModel, EvalModelTemplate +from tests.base import 
EvalModelTemplate def test_logger_collection(): @@ -61,7 +61,7 @@ def version(self): def test_custom_logger(tmpdir): hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) logger = CustomLogger() @@ -80,7 +80,7 @@ def test_custom_logger(tmpdir): def test_multiple_loggers(tmpdir): hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) logger1 = CustomLogger() logger2 = CustomLogger() diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 11961234ed41b..2ca3eaf513da7 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -5,7 +5,7 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import NeptuneLogger -from tests.base import LightningTestModel +from tests.base import EvalModelTemplate @patch('pytorch_lightning.loggers.neptune.neptune') @@ -61,7 +61,7 @@ def test_neptune_additional_methods(neptune): def test_neptune_leave_open_experiment_after_fit(tmpdir): """Verify that neptune experiment was closed after training""" - model = LightningTestModel(tutils.get_default_hparams()) + model = EvalModelTemplate(tutils.get_default_hparams()) def _run_training(logger): logger._experiment = MagicMock() diff --git a/tests/loggers/test_trains.py b/tests/loggers/test_trains.py index 305d0707079b0..738a0d9bcf867 100644 --- a/tests/loggers/test_trains.py +++ b/tests/loggers/test_trains.py @@ -3,13 +3,12 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import TrainsLogger -from tests.base import LightningTestModel +from tests.base import EvalModelTemplate def test_trains_logger(tmpdir): """Verify that basic functionality of TRAINS logger works.""" - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) 
TrainsLogger.set_bypass_mode(True) TrainsLogger.set_credentials(api_host='http://integration.trains.allegro.ai:8008', files_host='http://integration.trains.allegro.ai:8081', diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 3a63fcb9da57e..4cd0eff431adc 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -2,7 +2,6 @@ import pickle from unittest.mock import patch -import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger From d28b1453934b9facbe6d6613f943087d2eac35bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 4 May 2020 14:24:34 +0200 Subject: [PATCH 23/43] Update type hints for multiple dataloaders in .fit() and .test() (#1723) * update typehints * change log --- CHANGELOG.md | 2 ++ pytorch_lightning/trainer/trainer.py | 8 ++++++-- tests/trainer/test_dataloaders.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5457a6e980318..66bf99ba78a43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) +- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). 
+ ### Changed - Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7bfd97bfb83f1..23c40bfb50a78 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -672,7 +672,7 @@ def fit( self, model: LightningModule, train_dataloader: Optional[DataLoader] = None, - val_dataloaders: Optional[DataLoader] = None + val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None ): r""" Runs the full optimization routine. @@ -913,7 +913,11 @@ def run_pretrain_routine(self, model: LightningModule): # CORE TRAINING LOOP self.train() - def test(self, model: Optional[LightningModule] = None, test_dataloaders: Optional[DataLoader] = None): + def test( + self, + model: Optional[LightningModule] = None, + test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None + ): r""" Separates from fit to make sure you never run on your test set until you want to. 
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index b6f6262ee90e1..d847b6c8730c2 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -113,7 +113,7 @@ class CurrentTestModel( trainer.fit(model) trainer.test() - # verify there are 2 val loaders + # verify there are 2 test loaders assert len(trainer.test_dataloaders) == 2, \ 'Multiple test_dataloaders not initiated properly' @@ -125,7 +125,7 @@ class CurrentTestModel( trainer.test() -def test_train_dataloaders_passed_to_fit(tmpdir): +def test_train_dataloader_passed_to_fit(tmpdir): """Verify that train dataloader can be passed to fit """ class CurrentTestModel(LightTrainDataloader, TestModelBase): @@ -175,7 +175,7 @@ class CurrentTestModel( def test_all_dataloaders_passed_to_fit(tmpdir): - """Verify train, val & test dataloader can be passed to fit """ + """Verify train, val & test dataloader(s) can be passed to fit and test method""" class CurrentTestModel( LightTrainDataloader, From 1077159834199a9fc06d6c4f21f551a180c3e75a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 17:38:08 +0200 Subject: [PATCH 24/43] Tests: refactor models (#1691) * refactor default model * drop redundant seeds * drop redundant seeds * refactor models tests * refactor models tests * imports * fix conf * Apply suggestions from code review --- pytorch_lightning/core/lightning.py | 4 +- tests/callbacks/test_callbacks.py | 3 +- tests/models/test_amp.py | 6 +- tests/models/test_cpu.py | 26 ++------- tests/models/test_gpu.py | 6 +- tests/models/test_hooks.py | 16 +----- tests/models/test_horovod.py | 5 +- tests/models/test_restore.py | 87 ++++++++++++++++------------- 8 files changed, 67 insertions(+), 86 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 2f1de6412f0f0..a534929434a8b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1535,8 +1535,8 @@ def 
_load_model_state(cls, checkpoint: Dict[str, Any], *args, **kwargs) -> 'Ligh hparams = Namespace(**ckpt_hparams) if is_namespace else ckpt_hparams else: rank_zero_warn( - f"Checkpoint does not contain hyperparameters but {cls.__name__}'s __init__ " - f"contains argument 'hparams'. Will pass in an empty Namespace instead." + f"Checkpoint does not contain hyperparameters but {cls.__name__}'s __init__" + " contains argument 'hparams'. Will pass in an empty Namespace instead." " Did you forget to store your model hyperparameters in self.hparams?" ) hparams = Namespace() diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 2bbcfaea1f191..b1fb71dc8e978 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -249,8 +249,7 @@ def test_pickling(tmpdir): @pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2]) def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k): - """ Test that None in checkpoint callback is valid and that chkp_path is - set correctly """ + """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """ tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, TestModelBase): diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index f4f1d9c20a6e9..52fb90f135bae 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -6,7 +6,7 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import LightningTestModel, EvalModelTemplate +from tests.base import EvalModelTemplate @pytest.mark.spawn @@ -15,7 +15,6 @@ def test_amp_single_gpu(tmpdir, backend): """Make sure DP/DDP + AMP work.""" tutils.reset_seed() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -63,8 +62,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): tutils.set_random_master_port() os.environ['SLURM_LOCALID'] = str(0) - hparams = 
tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # exp file to get meta logger = tutils.get_default_logger(tmpdir) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 46d1ba6e44aaf..13120c01756c1 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -7,16 +7,8 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ( - EarlyStopping, -) -from tests.base import ( - TestModelBase, - LightTrainDataloader, - LightningTestModel, - LightTestMixin, - EvalModelTemplate, -) +from pytorch_lightning.callbacks import EarlyStopping +from tests.base import EvalModelTemplate def test_early_stopping_cpu_model(tmpdir): @@ -106,8 +98,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -138,11 +129,7 @@ def test_running_test_after_fitting(tmpdir): def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -220,8 +207,7 @@ def test_single_gpu_batch_parse(): def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # fit model trainer = Trainer( @@ -285,7 +271,7 @@ def __getitem__(self, i): def __len__(self): return 1 - class 
BpttTestModel(LightTrainDataloader, TestModelBase): + class BpttTestModel(EvalModelTemplate): def __init__(self, hparams): super().__init__(hparams) self.test_hidden = None diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index dbaf4db8f8ed2..5bdb603e14518 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -9,7 +9,7 @@ from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import LightningTestModel, EvalModelTemplate +from tests.base import EvalModelTemplate PRETEND_N_OF_GPUS = 16 @@ -65,7 +65,7 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir): def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -112,7 +112,7 @@ def test_cpu_slurm_save_load(tmpdir): logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir), ) - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) # set the epoch start hook so we can predict before the model does the full training def assert_pred_same(): diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1d0e55df409e0..00147ef2bc089 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -2,29 +2,19 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer -from tests.base import ( - LightTrainDataloader, - LightValidationMixin, - TestModelBase, - LightTestMixin) +from tests.base import EvalModelTemplate @pytest.mark.parametrize('max_steps', [1, 2, 3]) def test_on_before_zero_grad_called(max_steps): - class CurrentTestModel( - LightTrainDataloader, - LightValidationMixin, - LightTestMixin, - TestModelBase, - ): + class 
CurrentTestModel(EvalModelTemplate): on_before_zero_grad_called = 0 def on_before_zero_grad(self, optimizer): self.on_before_zero_grad_called += 1 - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = CurrentTestModel(tutils.get_default_hparams()) trainer = Trainer( max_steps=max_steps, diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 0f41dee6e4fb0..14644aee6649d 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -11,7 +11,7 @@ from pytorch_lightning import Trainer import tests.base.utils as tutils -from tests.base import LightningTestModel +from tests.base import EvalModelTemplate from tests.base.models import TestGAN try: @@ -107,7 +107,8 @@ def test_horovod_multi_gpu(tmpdir): @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_horovod_transfer_batch_to_gpu(tmpdir): - class TestTrainingStepModel(LightningTestModel): + + class TestTrainingStepModel(EvalModelTemplate): def training_step(self, batch, *args, **kwargs): x, y = batch assert str(x.device) != 'cpu' diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index af0165d498ab0..0a927a3a94e0a 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -9,11 +9,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - LightningTestModel, - LightningTestModelWithoutHyperparametersArg, - LightningTestModelWithUnusedHyperparametersArg -) +from tests.base import EvalModelTemplate @pytest.mark.spawn @@ -23,8 +19,7 @@ def test_running_test_pretrained_model_distrib(tmpdir, backend): """Verify `test()` on pretrained model.""" tutils.set_random_master_port() - hparams = tutils.get_default_hparams() - model = 
LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -53,7 +48,7 @@ def test_running_test_pretrained_model_distrib(tmpdir, backend): assert result == 1, 'training failed to complete' pretrained_model = tutils.load_model(logger, trainer.checkpoint_callback.dirpath, - module_class=LightningTestModel) + module_class=EvalModelTemplate) # run test set new_trainer = Trainer(**trainer_options) @@ -72,8 +67,7 @@ def test_running_test_pretrained_model_distrib(tmpdir, backend): def test_running_test_pretrained_model_cpu(tmpdir): """Verify test() on pretrained model.""" - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -97,7 +91,7 @@ def test_running_test_pretrained_model_cpu(tmpdir): # correct result and ok accuracy assert result == 1, 'training failed to complete' pretrained_model = tutils.load_model( - logger, trainer.checkpoint_callback.dirpath, module_class=LightningTestModel + logger, trainer.checkpoint_callback.dirpath, module_class=EvalModelTemplate ) new_trainer = Trainer(**trainer_options) @@ -110,7 +104,7 @@ def test_running_test_pretrained_model_cpu(tmpdir): def test_load_model_from_checkpoint(tmpdir): """Verify test() on pretrained model.""" hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) trainer_options = dict( progress_bar_refresh_rate=0, @@ -131,7 +125,7 @@ def test_load_model_from_checkpoint(tmpdir): # load last checkpoint last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1] - pretrained_model = LightningTestModel.load_from_checkpoint(last_checkpoint) + pretrained_model = EvalModelTemplate.load_from_checkpoint(last_checkpoint) # test that hparams loaded correctly for k, v in 
vars(hparams).items(): @@ -152,7 +146,13 @@ def test_load_model_from_checkpoint(tmpdir): def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) + + trainer_options = dict( + max_epochs=1, + gpus=2, + distributed_backend='dp', + ) # get logger logger = tutils.get_default_logger(tmpdir) @@ -161,13 +161,9 @@ def test_dp_resume(tmpdir): # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) - trainer_options = dict( - max_epochs=1, - gpus=2, - distributed_backend='dp', - logger=logger, - checkpoint_callback=checkpoint, - ) + # add these to the trainer options + trainer_options['logger'] = logger + trainer_options['checkpoint_callback'] = checkpoint # fit model trainer = Trainer(**trainer_options) @@ -188,13 +184,11 @@ def test_dp_resume(tmpdir): # init new trainer new_logger = tutils.get_default_logger(tmpdir, version=logger.version) - trainer_options.update( - logger=new_logger, - checkpoint_callback=ModelCheckpoint(tmpdir), - train_percent_check=0.5, - val_percent_check=0.2, - max_epochs=1, - ) + trainer_options['logger'] = new_logger + trainer_options['checkpoint_callback'] = ModelCheckpoint(tmpdir) + trainer_options['train_percent_check'] = 0.5 + trainer_options['val_percent_check'] = 0.2 + trainer_options['max_epochs'] = 1 new_trainer = Trainer(**trainer_options) # set the epoch start hook so we can predict before the model does the full training @@ -210,7 +204,7 @@ def assert_good_acc(): tutils.run_prediction(dataloader, dp_model, dp=True) # new model - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) model.on_train_start = assert_good_acc # fit new model which should load hpc weights @@ -223,18 +217,19 @@ def assert_good_acc(): def test_model_saving_loading(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - hparams = 
tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) - # fit model - trainer = Trainer( + trainer_options = dict( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir) ) + + # fit model + trainer = Trainer(**trainer_options) result = trainer.fit(model) # traning complete @@ -263,7 +258,7 @@ def test_model_saving_loading(tmpdir): # load new model tags_path = tutils.get_data_path(logger, path_dir=tmpdir) tags_path = os.path.join(tags_path, 'meta_tags.csv') - model_2 = LightningTestModel.load_from_checkpoint( + model_2 = EvalModelTemplate.load_from_checkpoint( checkpoint_path=new_weights_path, tags_csv=tags_path ) @@ -276,8 +271,7 @@ def test_model_saving_loading(tmpdir): def test_load_model_with_missing_hparams(tmpdir): - # fit model - trainer = Trainer( + trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=1, checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1), @@ -285,22 +279,35 @@ def test_load_model_with_missing_hparams(tmpdir): default_root_dir=tmpdir, ) - model = LightningTestModelWithoutHyperparametersArg() + # fit model + trainer = Trainer(**trainer_options) + + class CurrentModelWithoutHparams(EvalModelTemplate): + def __init__(self): + hparams = tutils.get_default_hparams() + super().__init__(hparams) + + class CurrentModelUnusedHparams(EvalModelTemplate): + def __init__(self, hparams): + hparams = tutils.get_default_hparams() + super().__init__(hparams) + + model = CurrentModelWithoutHparams() trainer.fit(model) last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1] # try to load a checkpoint that has hparams but model is missing hparams arg with pytest.raises(MisconfigurationException, match=r".*__init__ is missing the argument 'hparams'.*"): - LightningTestModelWithoutHyperparametersArg.load_from_checkpoint(last_checkpoint) + 
CurrentModelWithoutHparams.load_from_checkpoint(last_checkpoint) # create a checkpoint without hyperparameters # if the model does not take a hparams argument, it should not throw an error ckpt = torch.load(last_checkpoint) del(ckpt['hparams']) torch.save(ckpt, last_checkpoint) - LightningTestModelWithoutHyperparametersArg.load_from_checkpoint(last_checkpoint) + CurrentModelWithoutHparams.load_from_checkpoint(last_checkpoint) # load checkpoint without hparams again # warn if user's model has hparams argument with pytest.warns(UserWarning, match=r".*Will pass in an empty Namespace instead."): - LightningTestModelWithUnusedHyperparametersArg.load_from_checkpoint(last_checkpoint) + CurrentModelUnusedHparams.load_from_checkpoint(last_checkpoint) From e865b046b1dceb582f40a17baf335ba2523dd658 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 4 May 2020 17:38:51 +0200 Subject: [PATCH 25/43] Bugfix/lr finder (#1676) * fix early stopping bug * allow val dataloader * update CHANGELOG.md * fix early stopping bug * allow val dataloader * update CHANGELOG.md Co-authored-by: Nicki Skafte --- CHANGELOG.md | 5 ++++- pytorch_lightning/trainer/lr_finder.py | 21 +++++++++++++++------ tests/trainer/test_lr_finder.py | 4 ++-- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66bf99ba78a43..ad5ddbe22e49a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) +- Updated LightningTemplateModel to look more like Colab example ([#1577](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) + ### Deprecated ### Removed @@ -32,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1485)) +- Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676)) + ## [0.7.5] - 2020-04-27 ### Changed @@ -76,7 +80,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Defines shared proc. rank, remove rank from instances (e.g. loggers) ([#1408](https://github.com/PyTorchLightning/pytorch-lightning/pull/1408)) - Updated semantic segmentation example with custom U-Net and logging ([#1371](https://github.com/PyTorchLightning/pytorch-lightning/pull/1371)) - Disabled val and test shuffling ([#1600](https://github.com/PyTorchLightning/pytorch-lightning/pull/1600)) -- Updated LightningTemplateModel to look more like Colab example ([#1546](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) ### Deprecated diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py index b0491c818dd5b..e664fd3cc47d0 100755 --- a/pytorch_lightning/trainer/lr_finder.py +++ b/pytorch_lightning/trainer/lr_finder.py @@ -53,6 +53,7 @@ def _run_lr_finder_internally(self, model: LightningModule): def lr_find(self, model: LightningModule, train_dataloader: Optional[DataLoader] = None, + val_dataloaders: Optional[DataLoader] = None, min_lr: float = 1e-8, max_lr: float = 1, num_training: int = 100, @@ -105,7 +106,7 @@ def lr_find(self, """ save_path = os.path.join(self.default_root_dir, 'lr_find_temp.ckpt') - self._dump_params(model) + self._lr_finder_dump_params(model) # Prevent going into infinite loop self.auto_lr_find = False @@ -129,8 +130,10 @@ def lr_find(self, # Accumulation of gradients self.accumulate_grad_batches = num_accumulation_steps - # Disable standard checkpoint + # Disable standard checkpoint & early stopping self.checkpoint_callback = False + 
self.early_stop_callback = None + self.enable_early_stop = False # Required for saving the model self.optimizers, self.schedulers = [], [], @@ -150,7 +153,9 @@ def lr_find(self, model.configure_optimizers = lr_finder._get_new_optimizer(optimizers[0]) # Fit, lr & loss logged in callback - self.fit(model, train_dataloader=train_dataloader) + self.fit(model, + train_dataloader=train_dataloader, + val_dataloaders=val_dataloaders) # Prompt if we stopped early if self.global_step != num_training: @@ -165,13 +170,13 @@ def lr_find(self, os.remove(save_path) # Finish by resetting variables so trainer is ready to fit model - self._restore_params(model) + self._lr_finder_restore_params(model) if self.progress_bar_callback: self.progress_bar_callback.enable() return lr_finder - def _dump_params(self, model): + def _lr_finder_dump_params(self, model): # Prevent going into infinite loop self._params = { 'auto_lr_find': self.auto_lr_find, @@ -181,11 +186,13 @@ def _dump_params(self, model): 'progress_bar_refresh_rate': self.progress_bar_refresh_rate, 'accumulate_grad_batches': self.accumulate_grad_batches, 'checkpoint_callback': self.checkpoint_callback, + 'early_stop_callback': self.early_stop_callback, + 'enable_early_stop': self.enable_early_stop, 'progress_bar_callback': self.progress_bar_callback, 'configure_optimizers': model.configure_optimizers, } - def _restore_params(self, model): + def _lr_finder_restore_params(self, model): self.auto_lr_find = self._params['auto_lr_find'] self.logger = self._params['logger'] self.callbacks = self._params['callbacks'] @@ -193,6 +200,8 @@ def _restore_params(self, model): self.progress_bar_refresh_rate = self._params['progress_bar_refresh_rate'] self.accumulate_grad_batches = self._params['accumulate_grad_batches'] self.checkpoint_callback = self._params['checkpoint_callback'] + self.early_stop_callback = self._params['early_stop_callback'] + self.enable_early_stop = self._params['enable_early_stop'] self.progress_bar_callback = 
self._params['progress_bar_callback'] model.configure_optimizers = self._params['configure_optimizers'] diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index ba6e9c336b130..ea2eca3d712ad 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -82,8 +82,8 @@ class CurrentTestModel( ) changed_attributes = ['callbacks', 'logger', 'max_steps', 'auto_lr_find', - 'progress_bar_refresh_rate', - 'accumulate_grad_batches', + 'progress_bar_refresh_rate', 'early_stop_callback', + 'accumulate_grad_batches', 'enable_early_stop', 'checkpoint_callback'] attributes_before = {} for ca in changed_attributes: From 1a9f1c80a131cde9dea395081f16e0222df31593 Mon Sep 17 00:00:00 2001 From: Ryan Henderson Date: Mon, 4 May 2020 17:40:50 +0200 Subject: [PATCH 26/43] Fix example argument parser in docs (#1692) [`parser.parse_known_args()`](https://docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_known_args) actually returns a tuple of the Namespace of known args and a list of unknown args. We only want the former. 
--- docs/source/hyperparameters.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index ff067b82e0ae3..a1364b5084156 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -213,7 +213,7 @@ Now we can allow each model to inject the arguments it needs in the main.py parser.add_argument('--model_name', type=str, default='gan', help='gan or mnist') # THIS LINE IS KEY TO PULL THE MODEL NAME - temp_args = parser.parse_known_args() + temp_args, _ = parser.parse_known_args() # let the model add what it wants if temp_args.model_name == 'gan': From f90afa29b884307755d05afd5856a2187f03c8c7 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 4 May 2020 10:02:57 -0700 Subject: [PATCH 27/43] Fix disabling progress bar on non-zero ranks using Horovod backend (#1709) * Fix Horovod backend to disable progress bar on all ranks except 0 * Add join barriers * Added changelog * Make protected and add verbosity * Refactor to disable progress bar callback in train * Removed vebose setting * Add cache check for Horovod * Test run again * Updated comment * Always skip cache for Horovod * Only reinstall when necessary * Added separate step * Fixed spacing * Skip Python 3.8 --- .github/workflows/ci-testing.yml | 11 +++++++++++ CHANGELOG.md | 2 ++ pytorch_lightning/trainer/distrib_parts.py | 5 +++-- pytorch_lightning/trainer/trainer.py | 11 +++++++++++ pytorch_lightning/trainer/training_io.py | 12 ++++++++++++ 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index e9343b6b9ce71..ac24dcee0a1e1 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -86,6 +86,17 @@ jobs: pip list shell: bash + - name: Reinstall Horovod if necessary + if: runner.os != 'windows' && matrix.python-version != '3.8' + run: | + HOROVOD_BUILT=$(python -c "import horovod.torch; 
horovod.torch.nccl_built(); print('SUCCESS')") + if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then + pip uninstall -y horovod + HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --no-cache-dir $(grep "horovod" requirements-extra.txt) + fi + horovodrun --check-build + shell: bash + - name: Cache datasets uses: actions/cache@v1 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index ad5ddbe22e49a..bda687d2bbfca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1485)) +- Fixed disabling progress bar on non-zero ranks using Horovod backend ([#1709](https://github.com/PyTorchLightning/pytorch-lightning/pull/1709)) + - Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676)) ## [0.7.5] - 2020-04-27 diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index a9f4b6114522e..bcd0c0724ee7c 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -576,8 +576,9 @@ def horovod_train(self, model): torch.cuda.set_device(self.root_gpu) model.cuda(self.root_gpu) - # Only show progress bar from the first worker - self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0 + # avoid duplicating progress bar + if hvd.rank() != 0 and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() # CHOOSE OPTIMIZER # allow for lr schedulers as well diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 23c40bfb50a78..0353fae2bff7f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,6 +51,13 @@ else: XLA_AVAILABLE = True +try: + import 
horovod.torch as hvd +except ImportError: + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + class Trainer( TrainerIOMixin, @@ -853,6 +860,10 @@ def run_pretrain_routine(self, model: LightningModule): # wait for all processes to catch up torch_xla.core.xla_model.rendezvous("pl.Trainer.run_pretrain_routine") + elif self.use_horovod: + # wait for all processes to catch up + hvd.join() + # register auto-resubmit when on SLURM self.register_slurm_signal_handlers() diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 78d24fad0a18f..4f474b761e94f 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -112,6 +112,13 @@ else: XLA_AVAILABLE = True +try: + import horovod.torch as hvd +except ImportError: + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + class TrainerIOMixin(ABC): @@ -123,6 +130,7 @@ class TrainerIOMixin(ABC): resume_from_checkpoint: ... use_ddp: bool use_ddp2: bool + use_horovod: bool checkpoint_callback: ... 
proc_rank: int weights_save_path: str @@ -175,6 +183,10 @@ def restore_weights(self, model: LightningModule): # wait for all processes to catch up torch_xla.core.xla_model.rendezvous("pl.TrainerIOMixin.restore_weights") + elif self.use_horovod: + # wait for all processes to catch up + hvd.join() + # clear cache after restore if self.on_gpu: torch.cuda.empty_cache() From 6d58fb1353b04b85beb4c71b61e8b317bc456276 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 22:51:39 +0200 Subject: [PATCH 28/43] Tests: refactor trainer (#1728) * lr * optim * wip * wip * fix mean * flake8 --- tests/base/eval_model_optimizers.py | 2 +- tests/base/eval_model_template.py | 4 +- tests/base/eval_model_test_dataloaders.py | 3 + tests/base/eval_model_train_steps.py | 14 +++ tests/base/eval_model_valid_dataloaders.py | 4 + tests/base/eval_model_valid_epoch_ends.py | 8 +- tests/base/models.py | 1 + tests/trainer/test_lr_finder.py | 62 ++--------- tests/trainer/test_optimizers.py | 74 ++++--------- tests/trainer/test_trainer.py | 122 +++++++-------------- 10 files changed, 103 insertions(+), 191 deletions(-) diff --git a/tests/base/eval_model_optimizers.py b/tests/base/eval_model_optimizers.py index bcce319d4a565..2fd9b104a06d9 100644 --- a/tests/base/eval_model_optimizers.py +++ b/tests/base/eval_model_optimizers.py @@ -12,7 +12,7 @@ def configure_optimizers(self): optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) return optimizer - def configure_optimizers_empty(self): + def configure_optimizers__empty(self): return None def configure_optimizers__lbfgs(self): diff --git a/tests/base/eval_model_template.py b/tests/base/eval_model_template.py index 37f4dfbd04144..d97e8a925fc6d 100644 --- a/tests/base/eval_model_template.py +++ b/tests/base/eval_model_template.py @@ -1,3 +1,5 @@ +from argparse import Namespace + import torch import torch.nn as nn import torch.nn.functional as F @@ -37,7 +39,7 @@ def __init__(self, hparams: object) -> object: """Pass in 
parsed HyperOptArgumentParser to the model.""" # init superclass super().__init__() - self.hparams = hparams + self.hparams = Namespace(**hparams) if isinstance(hparams, dict) else hparams # if you specify an example input, the summary will show input/output for each layer self.example_input_array = torch.rand(5, 28 * 28) diff --git a/tests/base/eval_model_test_dataloaders.py b/tests/base/eval_model_test_dataloaders.py index ecbfe19142eda..158b398545588 100644 --- a/tests/base/eval_model_test_dataloaders.py +++ b/tests/base/eval_model_test_dataloaders.py @@ -9,3 +9,6 @@ def dataloader(self, train: bool): def test_dataloader(self): return self.dataloader(train=False) + + def test_dataloader__empty(self): + return None diff --git a/tests/base/eval_model_train_steps.py b/tests/base/eval_model_train_steps.py index f9d0663de64ec..8a4307555dccb 100644 --- a/tests/base/eval_model_train_steps.py +++ b/tests/base/eval_model_train_steps.py @@ -1,11 +1,16 @@ +import math from abc import ABC from collections import OrderedDict +import torch + class TrainingStepVariations(ABC): """ Houses all variations of training steps """ + test_step_inf_loss = float('inf') + def training_step(self, batch, batch_idx, optimizer_idx=None): """Lightning calls this inside the training loop""" # forward pass @@ -28,3 +33,12 @@ def training_step(self, batch, batch_idx, optimizer_idx=None): if self.trainer.batch_idx % 2 == 0: return loss_val + + def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): + output = self.training_step(batch, batch_idx, optimizer_idx) + if batch_idx == self.test_step_inf_loss: + if isinstance(output, dict): + output['loss'] *= torch.tensor(math.inf) # make loss infinite + else: + output /= 0 + return output diff --git a/tests/base/eval_model_valid_dataloaders.py b/tests/base/eval_model_valid_dataloaders.py index 2d6f2bf2af5d9..72b5afcceee0e 100644 --- a/tests/base/eval_model_valid_dataloaders.py +++ b/tests/base/eval_model_valid_dataloaders.py @@ -9,3 
+9,7 @@ def dataloader(self, train: bool): def val_dataloader(self): return self.dataloader(train=False) + + def val_dataloader__multiple(self): + return [self.dataloader(train=False), + self.dataloader(train=False)] diff --git a/tests/base/eval_model_valid_epoch_ends.py b/tests/base/eval_model_valid_epoch_ends.py index ab14ed10ef5ab..73866451023f5 100644 --- a/tests/base/eval_model_valid_epoch_ends.py +++ b/tests/base/eval_model_valid_epoch_ends.py @@ -16,9 +16,13 @@ def validation_epoch_end(self, outputs): """ # if returned a scalar from validation_step, outputs is a list of tensor scalars # we return just the average in this case (if we want) + def _mean(res, key): + # recursive mean for multilevel dicts + return torch.stack([x[key] if isinstance(x, dict) else _mean(x, key) for x in res]).mean() + # return torch.stack(outputs).mean() - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_acc_mean = torch.stack([x['val_acc'] for x in outputs]).mean() + val_loss_mean = _mean(outputs, 'val_loss') + val_acc_mean = _mean(outputs, 'val_acc') for output in outputs: val_loss = self.get_output_metric(output, 'val_loss') diff --git a/tests/base/models.py b/tests/base/models.py index ebc6d755761c8..4d39c5150b035 100644 --- a/tests/base/models.py +++ b/tests/base/models.py @@ -8,6 +8,7 @@ from torch import optim from torch.utils.data import DataLoader +from tests.base import EvalModelTemplate from tests.base.datasets import TrialMNIST try: diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index ea2eca3d712ad..ce9d3d3b1b0f3 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -4,25 +4,14 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - LightTrainDataloader, - TestModelBase, - LightTestMultipleOptimizersWithSchedulingMixin, -) +from tests.base import 
EvalModelTemplate def test_error_on_more_than_1_optimizer(tmpdir): """ Check that error is thrown when more than 1 optimizer is passed """ - class CurrentTestModel( - LightTestMultipleOptimizersWithSchedulingMixin, - LightTrainDataloader, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.configure_optimizers = model.configure_optimizers__multiple_schedulers # logger file to get meta trainer = Trainer( @@ -37,14 +26,7 @@ class CurrentTestModel( def test_model_reset_correctly(tmpdir): """ Check that model weights are correctly reset after lr_find() """ - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta trainer = Trainer( @@ -66,14 +48,7 @@ class CurrentTestModel( def test_trainer_reset_correctly(tmpdir): """ Check that all trainer parameters are reset correctly after lr_find() """ - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta trainer = Trainer( @@ -102,15 +77,10 @@ class CurrentTestModel( def test_trainer_arg_bool(tmpdir): - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) before_lr = hparams.learning_rate + # logger file to get meta trainer = Trainer( default_save_path=tmpdir, @@ -126,15 +96,10 @@ class CurrentTestModel( def test_trainer_arg_str(tmpdir): - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - hparams = tutils.get_default_hparams() hparams.__dict__['my_fancy_lr'] = 1.0 # update with 
non-standard field - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + before_lr = hparams.my_fancy_lr # logger file to get meta trainer = Trainer( @@ -151,14 +116,9 @@ class CurrentTestModel( def test_call_to_trainer_method(tmpdir): - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + before_lr = hparams.learning_rate # logger file to get meta trainer = Trainer( diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index be0ac5471d24c..665ba3cdfbc69 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -3,30 +3,15 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer -from tests.base import ( - TestModelBase, - LightTrainDataloader, - LightValidationStepMixin, - LightValidationMixin, - LightTestOptimizerWithSchedulingMixin, - LightTestMultipleOptimizersWithSchedulingMixin, - LightTestOptimizersWithMixedSchedulingMixin, - LightTestReduceLROnPlateauMixin, - LightTestNoneOptimizerMixin, EvalModelTemplate -) +from tests.base import EvalModelTemplate def test_optimizer_with_scheduling(tmpdir): """ Verify that learning rate scheduling is working """ - class CurrentTestModel( - LightTestOptimizerWithSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__single_scheduler # fit model trainer = Trainer( @@ -36,6 +21,7 @@ class CurrentTestModel( train_percent_check=0.2 ) results = trainer.fit(model) + assert results == 1 init_lr = hparams.learning_rate adjusted_lr = [pg['lr'] for pg in trainer.optimizers[0].param_groups] @@ -54,14 +40,9 @@ class CurrentTestModel( def test_multi_optimizer_with_scheduling(tmpdir): """ Verify that learning rate scheduling 
is working """ - class CurrentTestModel( - LightTestMultipleOptimizersWithSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__multiple_schedulers # fit model trainer = Trainer( @@ -71,6 +52,7 @@ class CurrentTestModel( train_percent_check=0.2 ) results = trainer.fit(model) + assert results == 1 init_lr = hparams.learning_rate adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] @@ -93,14 +75,9 @@ class CurrentTestModel( def test_multi_optimizer_with_scheduling_stepping(tmpdir): - class CurrentTestModel( - LightTestOptimizersWithMixedSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__multiple_schedulers # fit model trainer = Trainer( @@ -110,6 +87,7 @@ class CurrentTestModel( train_percent_check=0.2 ) results = trainer.fit(model) + assert results == 1 init_lr = hparams.learning_rate adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] @@ -127,7 +105,7 @@ class CurrentTestModel( adjusted_lr2 = adjusted_lr2[0] # Called ones after end of epoch - assert init_lr * 0.1 ** 0 == adjusted_lr1, \ + assert init_lr * 0.1 ** 1 == adjusted_lr1, \ 'lr for optimizer 1 not adjusted correctly' # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times assert init_lr * 0.1 == adjusted_lr2, \ @@ -136,16 +114,9 @@ class CurrentTestModel( def test_reduce_lr_on_plateau_scheduling(tmpdir): - class CurrentTestModel( - LightTestReduceLROnPlateauMixin, - LightTrainDataloader, - LightValidationMixin, - LightValidationStepMixin, - TestModelBase): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + 
model.configure_optimizers = model.configure_optimizers__reduce_lr_on_plateau # fit model trainer = Trainer( @@ -155,7 +126,7 @@ class CurrentTestModel( train_percent_check=0.2 ) results = trainer.fit(model) - assert results + assert results == 1 assert trainer.lr_schedulers[0] == \ dict(scheduler=trainer.lr_schedulers[0]['scheduler'], monitor='val_loss', @@ -233,14 +204,9 @@ def test_none_optimizer_warning(): def test_none_optimizer(tmpdir): - class CurrentTestModel( - LightTestNoneOptimizerMixin, - LightTrainDataloader, - TestModelBase): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__empty # fit model trainer = Trainer( @@ -256,11 +222,9 @@ class CurrentTestModel( def test_configure_optimizer_from_dict(tmpdir): - """Tests if `configure_optimizer` method could return a dictionary with - `optimizer` field only. - """ + """Tests if `configure_optimizer` method could return a dictionary with `optimizer` field only.""" - class CurrentTestModel(LightTrainDataloader, TestModelBase): + class CurrentModel(EvalModelTemplate): def configure_optimizers(self): config = { 'optimizer': torch.optim.SGD(params=self.parameters(), lr=1e-03) @@ -268,7 +232,7 @@ def configure_optimizers(self): return config hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = CurrentModel(hparams) # fit model trainer = Trainer(default_save_path=tmpdir, max_epochs=1) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index d72c04ed80e36..8ab722d8886b2 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -8,34 +8,24 @@ import torch import tests.base.utils as tutils -from pytorch_lightning import Callback +from pytorch_lightning import Callback, LightningModule from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from 
pytorch_lightning.core.lightning import load_hparams_from_tags_csv from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - TestModelBase, - DictHparamsModel, - LightningTestModel, - LightEmptyTestStep, - LightValidationStepMixin, - LightValidationMultipleDataloadersMixin, - LightTrainDataloader, - LightTestDataloader, - LightValidationMixin, EvalModelTemplate, -) +from tests.base import EvalModelTemplate def test_model_pickle(tmpdir): import pickle - model = TestModelBase(tutils.get_default_hparams()) + model = EvalModelTemplate(tutils.get_default_hparams()) pickle.dumps(model) def test_hparams_save_load(tmpdir): - model = DictHparamsModel({'in_features': 28 * 28, 'out_features': 10, 'failed_key': lambda x: x}) + model = EvalModelTemplate(vars(tutils.get_default_hparams())) trainer = Trainer( default_root_dir=tmpdir, @@ -48,19 +38,15 @@ def test_hparams_save_load(tmpdir): # try to load the model now pretrained_model = tutils.load_model_from_checkpoint( trainer.checkpoint_callback.dirpath, - module_class=DictHparamsModel + module_class=EvalModelTemplate ) + assert pretrained_model def test_no_val_module(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - hparams = tutils.get_default_hparams() - - class CurrentTestModel(LightTrainDataloader, TestModelBase): - pass - - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -84,7 +70,7 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): assert 'hparams' in ckpt.keys(), 'hparams missing from checkpoints' # won't load without hparams in the ckpt - model_2 = LightningTestModel.load_from_checkpoint( + model_2 = EvalModelTemplate.load_from_checkpoint( checkpoint_path=new_weights_path, ) model_2.eval() @@ -93,11 +79,7 @@ class 
CurrentTestModel(LightTrainDataloader, TestModelBase): def test_no_val_end_module(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - class CurrentTestModel(LightTrainDataloader, LightValidationStepMixin, TestModelBase): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -120,7 +102,7 @@ class CurrentTestModel(LightTrainDataloader, LightValidationStepMixin, TestModel # load new model tags_path = tutils.get_data_path(logger, path_dir=tmpdir) tags_path = os.path.join(tags_path, 'meta_tags.csv') - model_2 = LightningTestModel.load_from_checkpoint( + model_2 = EvalModelTemplate.load_from_checkpoint( checkpoint_path=new_weights_path, tags_csv=tags_path ) @@ -185,8 +167,7 @@ def _optimizer_step(self, epoch, batch_idx, optimizer, # clear gradients optimizer.zero_grad() - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) schedule = {1: 2, 3: 4} trainer = Trainer(accumulate_grad_batches=schedule, @@ -260,9 +241,6 @@ def test_model_checkpoint_options(tmpdir, save_top_k, file_prefix, expected_file def mock_save_function(filepath): open(filepath, 'a').close() - hparams = tutils.get_default_hparams() - _ = LightningTestModel(hparams) - # simulated losses losses = [10, 9, 2.8, 5, 2.5] @@ -288,8 +266,7 @@ def mock_save_function(filepath): def test_model_freeze_unfreeze(): - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) model.freeze() model.unfreeze() @@ -302,7 +279,7 @@ def test_resume_from_checkpoint_epoch_restored(tmpdir): def _new_model(): # Create a model that tracks epochs and batches seen - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) model.num_epochs_seen = 0 
model.num_batches_seen = 0 model.num_on_load_checkpoint_called = 0 @@ -452,15 +429,8 @@ def test_trainer_min_steps_and_epochs(tmpdir): def test_benchmark_option(tmpdir): """Verify benchmark option.""" - class CurrentTestModel( - LightValidationMultipleDataloadersMixin, - LightTrainDataloader, - TestModelBase - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.val_dataloader = model.val_dataloader__multiple # verify torch.backends.cudnn.benchmark is not turned on assert not torch.backends.cudnn.benchmark @@ -481,40 +451,34 @@ class CurrentTestModel( def test_testpass_overrides(tmpdir): + # todo: check duplicated tests against trainer_checks hparams = tutils.get_default_hparams() - class LocalModel(LightTrainDataloader, TestModelBase): - pass - - class LocalModelNoEnd(LightTrainDataloader, LightTestDataloader, LightEmptyTestStep, TestModelBase): - pass - - class LocalModelNoStep(LightTrainDataloader, TestModelBase): - def test_epoch_end(self, outputs): - return {} - # Misconfig when neither test_step or test_end is implemented - with pytest.raises(MisconfigurationException): - model = LocalModel(hparams) + with pytest.raises(MisconfigurationException, match='.*not implement `test_dataloader`.*'): + model = EvalModelTemplate(hparams) + model.test_dataloader = model.test_dataloader__empty Trainer().test(model) # Misconfig when neither test_step or test_end is implemented with pytest.raises(MisconfigurationException): - model = LocalModelNoStep(hparams) + model = EvalModelTemplate(hparams) + model.test_step = LightningModule.test_step Trainer().test(model) # No exceptions when one or both of test_step or test_end are implemented - model = LocalModelNoEnd(hparams) + model = EvalModelTemplate(hparams) + model.test_step_end = LightningModule.test_step_end Trainer().test(model) - model = LightningTestModel(hparams) + model = EvalModelTemplate(hparams) 
Trainer().test(model) def test_disabled_validation(): """Verify that `val_percent_check=0` disables the validation loop unless `fast_dev_run=True`.""" - class CurrentModel(LightTrainDataloader, LightValidationMixin, TestModelBase): + class CurrentModel(EvalModelTemplate): validation_step_invoked = False validation_epoch_end_invoked = False @@ -564,59 +528,56 @@ def validation_epoch_end(self, *args, **kwargs): def test_nan_loss_detection(tmpdir): - test_step = 8 - class InfLossModel(LightTrainDataloader, TestModelBase): + class CurrentModel(EvalModelTemplate): + test_batch_inf_loss = 8 - def training_step(self, batch, batch_idx): - output = super().training_step(batch, batch_idx) - if batch_idx == test_step: + def training_step(self, batch, batch_idx, optimizer_idx=None): + output = super().training_step(batch, batch_idx, optimizer_idx) + if batch_idx == self.test_batch_inf_loss: if isinstance(output, dict): output['loss'] *= torch.tensor(math.inf) # make loss infinite else: output /= 0 return output - hparams = tutils.get_default_hparams() - model = InfLossModel(hparams) + model = CurrentModel(tutils.get_default_hparams()) # fit model trainer = Trainer( default_root_dir=tmpdir, - max_steps=(test_step + 1), + max_steps=(model.test_batch_inf_loss + 1), terminate_on_nan=True ) with pytest.raises(ValueError, match=r'.*The loss returned in `training_step` is nan or inf.*'): trainer.fit(model) - assert trainer.global_step == test_step + assert trainer.global_step == model.test_batch_inf_loss for param in model.parameters(): assert torch.isfinite(param).all() def test_nan_params_detection(tmpdir): - test_step = 8 - class NanParamModel(LightTrainDataloader, TestModelBase): + class CurrentModel(EvalModelTemplate): + test_batch_nan = 8 def on_after_backward(self): - if self.global_step == test_step: + if self.global_step == self.test_batch_nan: # simulate parameter that became nan torch.nn.init.constant_(self.c_d1.bias, math.nan) - hparams = tutils.get_default_hparams() - -
model = NanParamModel(hparams) + model = CurrentModel(tutils.get_default_hparams()) trainer = Trainer( default_root_dir=tmpdir, - max_steps=(test_step + 1), + max_steps=(model.test_batch_nan + 1), terminate_on_nan=True ) with pytest.raises(ValueError, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'): trainer.fit(model) - assert trainer.global_step == test_step + assert trainer.global_step == model.test_batch_nan # after aborting the training loop, model still has nan-valued params params = torch.cat([param.view(-1) for param in model.parameters()]) @@ -626,7 +587,7 @@ def on_after_backward(self): def test_trainer_interrupted_flag(tmpdir): """Test the flag denoting that a user interrupted training.""" - model = DictHparamsModel({'in_features': 28 * 28, 'out_features': 10}) + model = EvalModelTemplate(tutils.get_default_hparams()) class InterruptCallback(Callback): def __init__(self): @@ -656,8 +617,7 @@ def test_gradient_clipping(tmpdir): Test gradient clipping """ - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # test that gradient is clipped correctly def _optimizer_step(*args, **kwargs): From 043ae697c2bea15e193e9731a982aae6fcc0fcb6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 May 2020 22:52:22 +0200 Subject: [PATCH 29/43] Tests: refactor callbacks (#1688) * refactor default model * drop redundant seeds * path * refactor callback tests * update * fix sch * wip * fix return * review --- tests/callbacks/test_callbacks.py | 53 ++++++++-------------------- tests/callbacks/test_progress_bar.py | 40 +++------------------ tests/trainer/test_dataloaders.py | 1 - 3 files changed, 18 insertions(+), 76 deletions(-) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index b1fb71dc8e978..32fdcdaeb51a1 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -3,28 +3,14 @@ from pytorch_lightning import 
Callback from pytorch_lightning import Trainer, LightningModule from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint -from tests.base import ( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - LightTestOptimizersWithMixedSchedulingMixin, - TestModelBase -) +from tests.base import EvalModelTemplate def test_trainer_callback_system(tmpdir): """Test the callback system.""" - class CurrentTestModel( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase, - ): - pass - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(hparams) def _check_args(trainer, pl_module): assert isinstance(trainer, Trainer) @@ -214,18 +200,18 @@ def on_test_end(self, trainer, pl_module): def test_early_stopping_no_val_step(tmpdir): """Test that early stopping callback falls back to training metrics when no validation defined.""" - class ModelWithoutValStep(LightTrainDataloader, TestModelBase): + class CurrentModel(EvalModelTemplate): def training_step(self, *args, **kwargs): output = super().training_step(*args, **kwargs) - loss = output['loss'] # could be anything else - output.update({'my_train_metric': loss}) + output.update({'my_train_metric': output['loss']}) # could be anything else return output - model = ModelWithoutValStep(tutils.get_default_hparams()) + model = CurrentModel(tutils.get_default_hparams()) + model.validation_step = None + model.val_dataloader = None stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1) - trainer = Trainer( default_root_dir=tmpdir, early_stop_callback=stopping, @@ -251,12 +237,7 @@ def test_pickling(tmpdir): def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k): """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """ tutils.reset_seed() - - class CurrentTestModel(LightTrainDataloader, TestModelBase): - pass - - hparams = tutils.get_default_hparams() - model = 
CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k) @@ -275,11 +256,8 @@ def test_lr_logger_single_lr(tmpdir): """ Test that learning rates are extracted and logged for single lr scheduler""" tutils.reset_seed() - class CurrentTestModel(LightTrainDataloader, TestModelBase): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.configure_optimizers = model.configure_optimizers__single_scheduler lr_logger = LearningRateLogger() trainer = Trainer( @@ -291,6 +269,7 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): ) results = trainer.fit(model) + assert results == 1 assert lr_logger.lrs, 'No learning rates logged' assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' @@ -302,13 +281,8 @@ def test_lr_logger_multi_lrs(tmpdir): """ Test that learning rates are extracted and logged for multi lr schedulers """ tutils.reset_seed() - class CurrentTestModel(LightTestOptimizersWithMixedSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.configure_optimizers = model.configure_optimizers__multiple_schedulers lr_logger = LearningRateLogger() trainer = Trainer( @@ -320,6 +294,7 @@ class CurrentTestModel(LightTestOptimizersWithMixedSchedulingMixin, ) results = trainer.fit(model) + assert results == 1 assert lr_logger.lrs, 'No learning rates logged' assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index 7cd5d5435adef..ebd35fedfa13d 100644 --- 
a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -4,12 +4,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ProgressBarBase, ProgressBar, ModelCheckpoint from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase -) +from tests.base import EvalModelTemplate @pytest.mark.parametrize('callbacks,refresh_rate', [ @@ -63,16 +58,7 @@ def test_progress_bar_misconfiguration(): def test_progress_bar_totals(): """Test that the progress finishes with the correct total steps processed.""" - class CurrentTestModel( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) trainer = Trainer( progress_bar_refresh_rate=1, @@ -121,16 +107,7 @@ class CurrentTestModel( def test_progress_bar_fast_dev_run(): - class CurrentTestModel( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) trainer = Trainer( fast_dev_run=True, @@ -163,16 +140,7 @@ class CurrentTestModel( def test_progress_bar_progress_refresh(refresh_rate): """Test that the three progress bars get correctly updated when using different refresh rates.""" - class CurrentTestModel( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) class CurrentProgressBar(ProgressBar): diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index d847b6c8730c2..d0a6dd869a0ee 100644 --- 
a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -33,7 +33,6 @@ dict(val_check_interval=10000), ]) def test_dataloader_config_errors(tmpdir, dataloader_options): - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, From 48e808c20e55fae47fc58b3e2c4e2056883b88b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 4 May 2020 22:53:06 +0200 Subject: [PATCH 30/43] Move generated RST files to subfolder (#1555) * move generated files to subfolder * remove if exists * reformat argv Co-Authored-By: Jirka Borovec * update rebase * rebase yml Co-authored-by: Jirka Borovec Co-authored-by: J. Borovec --- .gitignore | 4 +-- docs/source/conf.py | 56 +++++++++++++++++++++-------------------- docs/source/index.rst | 14 +++++------ docs/source/modules.rst | 7 ------ 4 files changed, 37 insertions(+), 44 deletions(-) delete mode 100644 docs/source/modules.rst diff --git a/.gitignore b/.gitignore index d5bea7f6d58f4..cb8fd278c5c4f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,9 +13,7 @@ test_tube_data/ test_tube_exp/ # Documentations -docs/source/pl_examples*.rst -docs/source/pytorch_lightning*.rst -docs/source/tests*.rst +docs/source/api docs/source/*.md # Byte-compiled / optimized / DLL files diff --git a/docs/source/conf.py b/docs/source/conf.py index 9a9948680b4c2..b41351f15015f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,6 +21,7 @@ # import m2r import builtins import pt_lightning_sphinx_theme +from sphinx.ext import apidoc PATH_HERE = os.path.abspath(os.path.dirname(__file__)) PATH_ROOT = os.path.join(PATH_HERE, '..', '..') @@ -127,18 +128,18 @@ # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
exclude_patterns = [ - 'pytorch_lightning.rst', - 'pl_examples.*', - 'modules.rst', + 'api/pytorch_lightning.rst', + 'api/pl_examples.*', + 'api/modules.rst', # deprecated/renamed: - 'pytorch_lightning.loggers.comet_logger.rst', # TODO: remove in v0.8.0 - 'pytorch_lightning.loggers.mlflow_logger.rst', # TODO: remove in v0.8.0 - 'pytorch_lightning.loggers.test_tube_logger.rst', # TODO: remove in v0.8.0 - 'pytorch_lightning.callbacks.pt_callbacks.*', # TODO: remove in v0.8.0 - 'pytorch_lightning.pt_overrides.*', # TODO: remove in v0.8.0 - 'pytorch_lightning.root_module.*', # TODO: remove in v0.8.0 - 'pytorch_lightning.logging.*', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.loggers.comet_logger.rst', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.loggers.mlflow_logger.rst', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.loggers.test_tube_logger.rst', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.callbacks.pt_callbacks.*', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.pt_overrides.*', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.root_module.*', # TODO: remove in v0.8.0 + 'api/pytorch_lightning.logging.*', # TODO: remove in v0.8.0 ] # The name of the Pygments (syntax highlighting) style to use. @@ -263,32 +264,33 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True -# https://github.com/rtfd/readthedocs.org/issues/1139 -# I use sphinx-apidoc to auto-generate API documentation for my project. -# Right now I have to commit these auto-generated files to my repository -# so that RTD can build them into HTML docs. It'd be cool if RTD could run -# sphinx-apidoc for me, since it's easy to forget to regen API docs -# and commit them to my repo after making changes to my code. 
+# packages for which sphinx-apidoc should generate the docs (.rst files) PACKAGES = [ pytorch_lightning.__name__, 'pl_examples', ] +apidoc_output_folder = os.path.join(PATH_HERE, 'api') + def run_apidoc(_): + sys.path.insert(0, apidoc_output_folder) + + # delete api-doc files before generating them + if os.path.exists(apidoc_output_folder): + shutil.rmtree(apidoc_output_folder) + for pkg in PACKAGES: - argv = ['-e', '-o', PATH_HERE, os.path.join(PATH_HERE, PATH_ROOT, pkg), - '**/test_*', '--force', '--private', '--module-first'] - try: - # Sphinx 1.7+ - from sphinx.ext import apidoc - apidoc.main(argv) - except ImportError: - # Sphinx 1.6 (and earlier) - from sphinx import apidoc - argv.insert(0, apidoc.__file__) - apidoc.main(argv) + argv = ['-e', + '-o', apidoc_output_folder, + os.path.join(PATH_ROOT, pkg), + '**/test_*', + '--force', + '--private', + '--module-first'] + + apidoc.main(argv) def setup(app): diff --git a/docs/source/index.rst b/docs/source/index.rst index 6d1bfa26c2853..b74a9490af4e0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -104,10 +104,10 @@ Indices and tables .. toctree:: :hidden: - pytorch_lightning.core - pytorch_lightning.callbacks - pytorch_lightning.loggers - pytorch_lightning.overrides - pytorch_lightning.profiler - pytorch_lightning.trainer - pytorch_lightning.utilities \ No newline at end of file + api/pytorch_lightning.core + api/pytorch_lightning.callbacks + api/pytorch_lightning.loggers + api/pytorch_lightning.overrides + api/pytorch_lightning.profiler + api/pytorch_lightning.trainer + api/pytorch_lightning.utilities \ No newline at end of file diff --git a/docs/source/modules.rst b/docs/source/modules.rst deleted file mode 100644 index e4c5121858c28..0000000000000 --- a/docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -pl_examples -=========== - -.. 
toctree:: - :maxdepth: 4 - - pl_examples From a6de1b8d75c67cdf18e3cc0a24a1f471d4069613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 5 May 2020 04:16:54 +0200 Subject: [PATCH 31/43] doctest for .rst files (#1511) * add doctest to circleci * Revert "add doctest to circleci" This reverts commit c45b34ea911a81f87989f6c3a832b1e8d8c471c6. * Revert "Revert "add doctest to circleci"" This reverts commit 41fca97fdcfe1cf4f6bdb3bbba75d25fa3b11f70. * doctest docs rst files * Revert "doctest docs rst files" This reverts commit b4a2e83e3da5ed1909de500ec14b6b614527c07f. * doctest only rst * doctest debugging.rst * doctest apex * doctest callbacks * doctest early stopping * doctest for child modules * doctest experiment reporting * indentation * doctest fast training * doctest for hyperparams * doctests for lr_finder * doctests multi-gpu * more doctest * make doctest drone * fix label build error * update fast training * update invalid imports * fix problem with int device count * rebase stuff * wip * wip * wip * intro guide * add missing code block * circleci * logger import for doctest * test if doctest runs on drone * fix mnist download * also run install deps for building docs * install cmake * try sudo * hide output * try pip stuff * try to mock horovod * Tranfer -> Transfer * add torchvision to extras * revert pip stuff * mlflow file location * do not mock torch * torchvision * drone extra req. * try higher sphinx version * Revert "try higher sphinx version" This reverts commit 490ac28e46d6fd52352640dfdf0d765befa56988. 
* try coverage command * try coverage command * try undoc flag * newline * undo drone * report coverage * review Co-authored-by: Jirka Borovec * remove torchvision from extras * skip tests only if torchvision not available * fix testoutput torchvision Co-authored-by: Jirka Borovec --- .circleci/config.yml | 5 +- .drone.yml | 2 + docs/source/apex.rst | 9 +- docs/source/callbacks.rst | 37 ++- docs/source/child_modules.rst | 35 ++- docs/source/conf.py | 24 +- docs/source/debugging.rst | 26 +- docs/source/early_stopping.rst | 44 +-- docs/source/experiment_logging.rst | 211 +++++++------ docs/source/experiment_reporting.rst | 97 +++--- docs/source/fast_training.rst | 41 +-- docs/source/hyperparameters.rst | 101 +++--- docs/source/introduction_guide.rst | 442 ++++++++++++++------------- docs/source/lr_finder.rst | 20 +- docs/source/multi_gpu.rst | 57 ++-- docs/source/multiple_loaders.rst | 11 +- docs/source/new-project.rst | 36 ++- docs/source/optimizers.rst | 8 +- docs/source/sequences.rst | 17 +- docs/source/single_gpu.rst | 9 +- docs/source/slurm.rst | 120 ++++---- docs/source/test_set.rst | 9 +- docs/source/training_tricks.rst | 9 +- docs/source/transfer_learning.rst | 53 ++-- docs/source/weights_loading.rst | 36 ++- 25 files changed, 810 insertions(+), 649 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e6eca78ea59b1..28de0a75bdd12 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -64,10 +64,13 @@ references: name: Make Documentation command: | # sudo apt-get install pandoc + sudo apt-get update && sudo apt-get install -y cmake pip install -r requirements.txt --user sudo pip install -r docs/requirements.txt + pip install -r requirements-extra.txt --user # for doctesting loggers etc. 
# sphinx-apidoc -o ./docs/source ./pytorch_lightning **/test_* --force --follow-links - cd docs; make clean ; make html --debug --jobs 2 SPHINXOPTS="-W" + cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W" + make doctest; make coverage jobs: diff --git a/.drone.yml b/.drone.yml index 407ebd066cf9b..88e2d76a52503 100644 --- a/.drone.yml +++ b/.drone.yml @@ -35,9 +35,11 @@ steps: - apt-get update && apt-get install -y cmake - pip install -r requirements.txt --user -q - pip install -r ./tests/requirements-devel.txt --user -q + #- pip install -r ./docs/requirements.txt --user -q - pip list - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')" - coverage run --source pytorch_lightning -m py.test pytorch_lightning tests benchmarks -v --doctest-modules # --flake8 + #- cd docs; make doctest; make coverage - coverage report - codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG - python tests/collect_env_details.py diff --git a/docs/source/apex.rst b/docs/source/apex.rst index e1c7a1b2c8364..f705e040bd38f 100644 --- a/docs/source/apex.rst +++ b/docs/source/apex.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + 16-bit training ================= Lightning offers 16-bit training for CPUs, GPUs and TPUs. @@ -38,7 +43,7 @@ Install apex Enable 16-bit ^^^^^^^^^^^^^ -.. code-block:: python +.. testcode:: # turn on 16-bit trainer = Trainer(amp_level='O1', precision=16) @@ -50,7 +55,7 @@ TPU 16-bit ---------- 16-bit on TPus is much simpler. To use 16-bit with TPUs set precision to 16 when using the tpu flag -.. code-block:: python +.. 
testcode:: # DEFAULT trainer = Trainer(num_tpu_cores=8, precision=32) diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst index a2969820b2eeb..744c1f0c5edd6 100644 --- a/docs/source/callbacks.rst +++ b/docs/source/callbacks.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.base import Callback + .. role:: hidden :class: hidden-section @@ -18,21 +23,23 @@ An overall Lightning system should have: Example: -.. doctest:: - - >>> import pytorch_lightning as pl - >>> class MyPrintingCallback(pl.Callback): - ... - ... def on_init_start(self, trainer): - ... print('Starting to init trainer!') - ... - ... def on_init_end(self, trainer): - ... print('trainer is init now') - ... - ... def on_train_end(self, trainer, pl_module): - ... print('do something when training ends') - ... - >>> trainer = pl.Trainer(callbacks=[MyPrintingCallback()]) +.. testcode:: + + class MyPrintingCallback(Callback): + + def on_init_start(self, trainer): + print('Starting to init trainer!') + + def on_init_end(self, trainer): + print('trainer is init now') + + def on_train_end(self, trainer, pl_module): + print('do something when training ends') + + trainer = Trainer(callbacks=[MyPrintingCallback()]) + +.. testoutput:: + Starting to init trainer! trainer is init now diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst index 49fe6f463c373..4c2d60cc13246 100644 --- a/docs/source/child_modules.rst +++ b/docs/source/child_modules.rst @@ -1,3 +1,22 @@ +.. 
testsetup:: * + + import torch + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.base import Callback + from pytorch_lightning.core.lightning import LightningModule + + class LitMNIST(LightningModule): + + def __init__(self): + super().__init__() + + def train_dataloader(self): + pass + + def val_dataloader(self): + pass + + Child Modules ------------- Research projects tend to test different approaches to the same dataset. @@ -7,13 +26,18 @@ For example, imagine we now want to train an Autoencoder to use as a feature ext Recall that `LitMNIST` already defines all the dataloading etc... The only things that change in the `Autoencoder` model are the init, forward, training, validation and test step. -.. code-block:: python +.. testcode:: class Encoder(torch.nn.Module): - ... + pass + + class Decoder(torch.nn.Module): + pass class AutoEncoder(LitMNIST): + def __init__(self): + super().__init__() self.encoder = Encoder() self.decoder = Decoder() @@ -30,10 +54,10 @@ that change in the `Autoencoder` model are the init, forward, training, validati return loss def validation_step(self, batch, batch_idx): - return self._shared_eval(batch, batch_idx, 'val'): + return self._shared_eval(batch, batch_idx, 'val') def test_step(self, batch, batch_idx): - return self._shared_eval(batch, batch_idx, 'test'): + return self._shared_eval(batch, batch_idx, 'test') def _shared_eval(self, batch, batch_idx, prefix): x, y = batch @@ -43,6 +67,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati loss = F.nll_loss(logits, y) return {f'{prefix}_loss': loss} + and we can train this using the same trainer .. code-block:: python @@ -58,5 +83,3 @@ In this case, we want to use the `AutoEncoder` to extract image representations some_images = torch.Tensor(32, 1, 28, 28) representations = autoencoder(some_images) - -..
\ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b41351f15015f..f6dad2c3922ea 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -309,7 +309,7 @@ def setup(app): # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule MOCK_REQUIRE_PACKAGES = [] -with open(os.path.join(PATH_ROOT, 'requirements.txt'), 'r') as fp: +with open(os.path.join(PATH_ROOT, 'requirements-extra.txt'), 'r') as fp: for ln in fp.readlines(): found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] pkg = ln[:min(found)] if found else ln @@ -318,19 +318,10 @@ def setup(app): # TODO: better parse from package since the import name and package name may differ MOCK_MANUAL_PACKAGES = [ - 'torch', 'torchvision', 'PIL', - 'test_tube', - 'mlflow', - 'comet_ml', - 'wandb', - 'neptune', - 'trains', ] autodoc_mock_imports = MOCK_REQUIRE_PACKAGES + MOCK_MANUAL_PACKAGES -# for mod_name in MOCK_REQUIRE_PACKAGES: -# sys.modules[mod_name] = mock.Mock() # Options for the linkcode extension @@ -405,3 +396,16 @@ def find_source(): # Useful for avoiding ambiguity when the same section heading appears in different documents. # http://www.sphinx-doc.org/en/master/usage/extensions/autosectionlabel.html autosectionlabel_prefix_document = True + +# only run doctests marked with a ".. doctest::" directive +doctest_test_doctest_blocks = '' +doctest_global_setup = """ + +import importlib +import os +import torch + +TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision') + +""" +coverage_skip_undoc_in_source = True diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 775862d8c1826..412b6d613ecc6 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Debugging ========= The following are flags that make debugging much easier. @@ -11,9 +15,9 @@ a full epoch to crash. 
(See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.fast_dev_run` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(fast_dev_run=True) + trainer = Trainer(fast_dev_run=True) Inspect gradient norms ---------------------- @@ -22,10 +26,10 @@ Logs (to a logger), the norm of each weight matrix. (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.track_grad_norm` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: # the 2-norm - trainer = pl.Trainer(track_grad_norm=2) + trainer = Trainer(track_grad_norm=2) Log GPU usage ------------- @@ -34,9 +38,9 @@ Logs (to a logger) the GPU usage for each GPU on the master machine. (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.log_gpu_memory` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(log_gpu_memory=True) + trainer = Trainer(log_gpu_memory=True) Make model overfit on subset of data ------------------------------------ @@ -47,9 +51,9 @@ and try to get your model to overfit. If it can't, it's a sign it won't work wit (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.overfit_pct` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(overfit_pct=0.01) + trainer = Trainer(overfit_pct=0.01) Print the parameter count by layer ---------------------------------- @@ -59,9 +63,9 @@ To disable this behavior, turn off this flag: (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_summary` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. 
testcode:: - trainer = pl.Trainer(weights_summary=None) + trainer = Trainer(weights_summary=None) Set the number of validation sanity steps @@ -72,7 +76,7 @@ This avoids crashing in the validation loop sometime deep into a lengthy trainin (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.num_sanity_val_steps` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(num_sanity_val_steps=5) \ No newline at end of file diff --git a/docs/source/early_stopping.rst b/docs/source/early_stopping.rst index e74a720b30ebf..a0bfc83ec27d9 100644 --- a/docs/source/early_stopping.rst +++ b/docs/source/early_stopping.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + Early stopping ============== @@ -17,23 +23,25 @@ Enable Early Stopping using Callbacks on epoch end -------------------------------------------------- There are two ways to enable early stopping using callbacks on epoch end. -.. doctest:: +- Set early_stop_callback to True. Will look for 'val_loss' in validation_epoch_end() return dict. + If it is not found an error is raised. + + .. testcode:: + + trainer = Trainer(early_stop_callback=True) + +- Or configure your own callback - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.callbacks import EarlyStopping + .. testcode:: - # A) Set early_stop_callback to True. Will look for 'val_loss' - # in validation_epoch_end() return dict. If it is not found an error is raised. - >>> trainer = Trainer(early_stop_callback=True) - # B) Or configure your own callback - >>> early_stop_callback = EarlyStopping( - ... monitor='val_loss', - ... min_delta=0.00, - ... patience=3, - ... verbose=False, - ... mode='min' - ... 
) - >>> trainer = Trainer(early_stop_callback=early_stop_callback) + early_stop_callback = EarlyStopping( + monitor='val_loss', + min_delta=0.00, + patience=3, + verbose=False, + mode='min' + ) + trainer = Trainer(early_stop_callback=early_stop_callback) In any case, the callback will fall back to the training metrics (returned in :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step`, @@ -43,7 +51,8 @@ looking for a key to monitor if validation is disabled or is not defined. .. seealso:: - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` Disable Early Stopping with callbacks on epoch end -------------------------------------------------- @@ -53,4 +62,5 @@ Note that ``None`` will not disable early stopping but will lead to the default behaviour. .. seealso:: - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` diff --git a/docs/source/experiment_logging.rst b/docs/source/experiment_logging.rst index e9ddb47239b50..772efcfc13bc5 100644 --- a/docs/source/experiment_logging.rst +++ b/docs/source/experiment_logging.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + + Experiment Logging ================== @@ -14,31 +20,29 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: - - >>> import os - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import CometLogger - >>> comet_logger = CometLogger( - ... api_key=os.environ.get('COMET_API_KEY'), - ... workspace=os.environ.get('COMET_WORKSPACE'), # Optional - ... save_dir='.', # Optional - ... 
project_name='default_project', # Optional - ... rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional - ... experiment_name='default' # Optional - ... ) - >>> trainer = Trainer(logger=comet_logger) +.. testcode:: + + import os + from pytorch_lightning.loggers import CometLogger + comet_logger = CometLogger( + api_key=os.environ.get('COMET_API_KEY'), + workspace=os.environ.get('COMET_WORKSPACE'), # Optional + save_dir='.', # Optional + project_name='default_project', # Optional + rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional + experiment_name='default' # Optional + ) + trainer = Trainer(logger=comet_logger) The :class:`~pytorch_lightning.loggers.CometLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.CometLogger` docs. @@ -56,15 +60,14 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import MLFlowLogger - >>> mlf_logger = MLFlowLogger( - ... experiment_name="default", - ... tracking_uri="file:/." - ... ) - >>> trainer = Trainer(logger=mlf_logger) + from pytorch_lightning.loggers import MLFlowLogger + mlf_logger = MLFlowLogger( + experiment_name="default", + tracking_uri="file:./ml-runs" + ) + trainer = Trainer(logger=mlf_logger) .. 
seealso:: :class:`~pytorch_lightning.loggers.MLFlowLogger` docs. @@ -82,29 +85,27 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import NeptuneLogger - >>> neptune_logger = NeptuneLogger( - ... api_key='ANONYMOUS', # replace with your own - ... project_name='shared/pytorch-lightning-integration', - ... experiment_name='default', # Optional, - ... params={'max_epochs': 10}, # Optional, - ... tags=['pytorch-lightning', 'mlp'], # Optional, - ... ) - >>> trainer = Trainer(logger=neptune_logger) + from pytorch_lightning.loggers import NeptuneLogger + neptune_logger = NeptuneLogger( + api_key='ANONYMOUS', # replace with your own + project_name='shared/pytorch-lightning-integration', + experiment_name='default', # Optional, + params={'max_epochs': 10}, # Optional, + tags=['pytorch-lightning', 'mlp'], # Optional, + ) + trainer = Trainer(logger=neptune_logger) The :class:`~pytorch_lightning.loggers.NeptuneLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.NeptuneLogger` docs. @@ -122,28 +123,31 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. 
testcode:: + + from pytorch_lightning.loggers import TrainsLogger + trains_logger = TrainsLogger( + project_name='examples', + task_name='pytorch lightning test', + ) + trainer = Trainer(logger=trains_logger) + +.. testoutput:: + :options: +ELLIPSIS, +NORMALIZE_WHITESPACE + :hide: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import TrainsLogger - >>> trains_logger = TrainsLogger( - ... project_name='examples', - ... task_name='pytorch lightning test', - ... ) # doctest: +ELLIPSIS TRAINS Task: ... TRAINS results page: ... - >>> trainer = Trainer(logger=trains_logger) The :class:`~pytorch_lightning.loggers.TrainsLogger` is available anywhere in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def __init__(self): - ... some_img = fake_image() - ... self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0) + class MyModule(LightningModule): + def __init__(self): + some_img = fake_image() + self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TrainsLogger` docs. @@ -153,23 +157,21 @@ Tensorboard To use `TensorBoard `_ as your logger do the following. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import TensorBoardLogger - >>> logger = TensorBoardLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=logger) + from pytorch_lightning.loggers import TensorBoardLogger + logger = TensorBoardLogger('tb_logs', name='my_model') + trainer = Trainer(logger=logger) The :class:`~pytorch_lightning.loggers.TensorBoardLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. 
testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TensorBoardLogger` docs. @@ -188,22 +190,21 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning.loggers import TestTubeLogger - >>> logger = TestTubeLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=logger) + from pytorch_lightning.loggers import TestTubeLogger + logger = TestTubeLogger('tb_logs', name='my_model') + trainer = Trainer(logger=logger) The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TestTubeLogger` docs. @@ -221,24 +222,23 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. 
testcode:: - >>> from pytorch_lightning.loggers import WandbLogger - >>> wandb_logger = WandbLogger() - >>> trainer = Trainer(logger=wandb_logger) + from pytorch_lightning.loggers import WandbLogger + wandb_logger = WandbLogger() + trainer = Trainer(logger=wandb_logger) The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.log({ - ... "generated_images": [wandb.Image(some_img, caption="...")] - ... }) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.log({ + "generated_images": [wandb.Image(some_img, caption="...")] + }) .. seealso:: :class:`~pytorch_lightning.loggers.WandbLogger` docs. @@ -249,23 +249,22 @@ Multiple Loggers Lightning supports the use of multiple loggers, just pass a list to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger - >>> logger1 = TensorBoardLogger('tb_logs', name='my_model') - >>> logger2 = TestTubeLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=[logger1, logger2]) + from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger + logger1 = TensorBoardLogger('tb_logs', name='my_model') + logger2 = TestTubeLogger('tb_logs', name='my_model') + trainer = Trainer(logger=[logger1, logger2]) The loggers are available as a list anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... 
def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... # Option 1 - ... self.logger.experiment[0].add_image('generated_images', some_img, 0) - ... # Option 2 - ... self.logger[0].experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + # Option 1 + self.logger.experiment[0].add_image('generated_images', some_img, 0) + # Option 2 + self.logger[0].experiment.add_image('generated_images', some_img, 0) diff --git a/docs/source/experiment_reporting.rst b/docs/source/experiment_reporting.rst index 0353fcd7a9e03..8e534f4cc6d26 100644 --- a/docs/source/experiment_reporting.rst +++ b/docs/source/experiment_reporting.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Experiment Reporting ===================== @@ -11,10 +16,10 @@ Control logging frequency It may slow training down to log every single batch. Trainer has an option to log every k batches instead. -.. code-block:: python +.. testcode:: - # k = 10 - Trainer(row_log_interval=10) + k = 10 + trainer = Trainer(row_log_interval=k) Control log writing frequency ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -25,10 +30,10 @@ want to log using this trainer flag. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: - k = 100 - Trainer(log_save_interval=k) + k = 100 + trainer = Trainer(log_save_interval=k) Log metrics ^^^^^^^^^^^ @@ -37,46 +42,47 @@ To plot metrics into whatever logger you passed in (tensorboard, comet, neptune, 1. training_epoch_end, validation_epoch_end, test_epoch_end will all log anything in the "log" key of the return dict. -.. code-block:: python +.. testcode:: - def training_epoch_end(self, outputs): - loss = some_loss() - ... + def training_epoch_end(self, outputs): + loss = some_loss() + ... 
- logs = {'train_loss': loss} - results = {'log': logs} - return results + logs = {'train_loss': loss} + results = {'log': logs} + return results - def validation_epoch_end(self, outputs): - loss = some_loss() - ... + def validation_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'val_loss': loss} - results = {'log': logs} - return results + logs = {'val_loss': loss} + results = {'log': logs} + return results - def test_epoch_end(self, outputs): - loss = some_loss() - ... + def test_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'test_loss': loss} - results = {'log': logs} - return results + logs = {'test_loss': loss} + results = {'log': logs} + return results 2. In addition, you can also use any arbitrary functionality from a particular logger from within your LightningModule. For instance, here we log images using tensorboard. -.. code-block:: python +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE - def training_step(self, batch, batch_idx): - self.generated_imgs = self.decoder.generate() + def training_step(self, batch, batch_idx): + self.generated_imgs = self.decoder.generate() - sample_imgs = self.generated_imgs[:6] - grid = torchvision.utils.make_grid(sample_imgs) - self.logger.experiment.add_image('generated_images', grid, 0) + sample_imgs = self.generated_imgs[:6] + grid = torchvision.utils.make_grid(sample_imgs) + self.logger.experiment.add_image('generated_images', grid, 0) - ... - return results + ... + return results Modify progress bar ^^^^^^^^^^^^^^^^^^^ @@ -86,15 +92,15 @@ a key called "progress_bar". Here we show the validation loss in the progress bar -.. code-block:: python +.. testcode:: - def validation_epoch_end(self, outputs): - loss = some_loss() - ... + def validation_epoch_end(self, outputs): + loss = some_loss() + ... 
- logs = {'val_loss': loss} - results = {'progress_bar': logs} - return results + logs = {'val_loss': loss} + results = {'progress_bar': logs} + return results Snapshot hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -103,8 +109,8 @@ When Lightning creates a checkpoint, it stores a key "hparams" with the hyperpar .. code-block:: python - lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) - hyperparams = lightning_checkpoint['hparams'] + lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) + hyperparams = lightning_checkpoint['hparams'] Some loggers also allow logging the hyperparams used in the experiment. For instance, when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show @@ -115,8 +121,7 @@ Snapshot code Loggers also allow you to snapshot a copy of the code used in this experiment. For example, TestTubeLogger does this with a flag: -.. code-block:: python - - from pytorch_lightning.loggers import TestTubeLogger +.. testcode:: - logger = TestTubeLogger(create_git_tag=True) + from pytorch_lightning.loggers import TestTubeLogger + logger = TestTubeLogger('.', create_git_tag=True) diff --git a/docs/source/fast_training.rst b/docs/source/fast_training.rst index 970e9486173e1..208838f58b07c 100644 --- a/docs/source/fast_training.rst +++ b/docs/source/fast_training.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Fast Training ============= There are multiple options to speed up different parts of the training by choosing to train @@ -7,7 +12,7 @@ Check validation every n epochs ------------------------------- If you have a small dataset you might want to check validation every n epochs -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(check_val_every_n_epoch=1) @@ -19,7 +24,7 @@ It can be useful to force training for a minimum number of epochs or limit to a .. 
seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(min_epochs=1, max_epochs=1000) @@ -31,7 +36,7 @@ For large datasets it's often desirable to check validation multiple times withi Pass in a float to check that often within 1 training epoch. Pass in an int k to check every k training batches. Must use an int if using an IterableDataset. -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(val_check_interval=0.95) @@ -46,21 +51,21 @@ Use data subset for training, validation and test ------------------------------------------------- If you don't want to check 100% of the training/validation/test set (for debugging or if it's huge), set these flags. -.. code-block:: python - - # DEFAULT - trainer = Trainer( - train_percent_check=1.0, - val_percent_check=1.0, - test_percent_check=1.0 - ) - - # check 10%, 20%, 30% only, respectively for training, validation and test set - trainer = Trainer( - train_percent_check=0.1, - val_percent_check=0.2, - test_percent_check=0.3 - ) +.. testcode:: + + # DEFAULT + trainer = Trainer( + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0 + ) + + # check 10%, 20%, 30% only, respectively for training, validation and test set + trainer = Trainer( + train_percent_check=0.1, + val_percent_check=0.2, + test_percent_check=0.3 + ) .. note:: ``train_percent_check``, ``val_percent_check`` and ``test_percent_check`` will be overwritten by ``overfit_pct`` if ``overfit_pct`` > 0. ``val_percent_check`` will be ignored if ``fast_dev_run=True``. diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index a1364b5084156..5b2dd343fb622 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -1,3 +1,13 @@ +.. 
testsetup:: * + + import torch + from argparse import ArgumentParser, Namespace + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + import sys + sys.argv = ['foo'] + + Hyperparameters --------------- Lightning has utilities to interact seamlessly with the command line ArgumentParser @@ -7,13 +17,11 @@ ArgumentParser ^^^^^^^^^^^^^^ Lightning is designed to augment a lot of the functionality of the built-in Python ArgumentParser -.. code-block:: python +.. testcode:: from argparse import ArgumentParser - parser = ArgumentParser() parser.add_argument('--layer_1_dim', type=int, default=128) - args = parser.parse_args() This allows you to call your program like so: @@ -35,9 +43,9 @@ We can do this as follows. First, in your LightningModule, define the arguments specific to that module. Remember that data splits or data paths may also be specific to a module (ie: if your project has a model that trains on Imagenet and another on CIFAR-10). -.. code-block:: python +.. testcode:: - class LitModel(LightningModule): + class LitModel(LightningModule): @staticmethod def add_model_specific_args(parent_parser): @@ -48,13 +56,12 @@ a module (ie: if your project has a model that trains on Imagenet and another on Now in your main trainer file, add the Trainer args, the program args, and add the model args -.. code-block:: python +.. testcode:: # ---------------- # trainer_main.py # ---------------- from argparse import ArgumentParser - parser = ArgumentParser() # add PROGRAM level args @@ -66,7 +73,7 @@ Now in your main trainer file, add the Trainer args, the program args, and add t # add all the available trainer options to argparse # ie: now --gpus --num_nodes ... 
--fast_dev_run all work in the cli - parser = pl.Trainer.add_argparse_args(parser) + parser = Trainer.add_argparse_args(parser) hparams = parser.parse_args() @@ -78,9 +85,7 @@ Now you can call run your program like so Finally, make sure to start the training like so: -.. code-block:: bash - - hparams = parser.parse_args() +.. code-block:: python # YES model = LitModel(hparams) @@ -88,59 +93,56 @@ Finally, make sure to start the training like so: # NO # model = LitModel(learning_rate=hparams.learning_rate, ...) - #trainer = Trainer(gpus=hparams.gpus, ...) - + # trainer = Trainer(gpus=hparams.gpus, ...) -LightiningModule hparams -^^^^^^^^^^^^^^^^^^^^^^^^ +LightningModule hparams +^^^^^^^^^^^^^^^^^^^^^^^ Normally, we don't hard-code the values to a model. We usually use the command line to modify the network and read those values in the LightningModule -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def __init__(self, hparams): - super().__init__() + class LitMNIST(LightningModule): - # do this to save all arguments in any logger (tensorboard) - self.hparams = hparams - - self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10) + def __init__(self, hparams): + super().__init__() - def forward(self, x): - ... + # do this to save all arguments in any logger (tensorboard) + self.hparams = hparams - def train_dataloader(self): - ... 
- return DataLoader(mnist_train, batch_size=self.hparams.batch_size) + self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10) - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.hparams.learning_rate) + def train_dataloader(self): + return DataLoader(mnist_train, batch_size=self.hparams.batch_size) - @staticmethod - def add_model_specific_args(parent_parser): - parser = ArgumentParser(parents=[parent_parser], add_help=False) + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.hparams.learning_rate) - parser.add_argument('--layer_1_dim', type=int, default=128) - parser.add_argument('--layer_2_dim', type=int, default=256) - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--learning_rate', type=float, default=0.002) - return parser + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--layer_1_dim', type=int, default=128) + parser.add_argument('--layer_2_dim', type=int, default=256) + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--learning_rate', type=float, default=0.002) + return parser Now pass in the params when you init your model .. code-block:: python - hparams = parse_args() + parser = ArgumentParser() + parser = LitMNIST.add_model_specific_args(parser) + hparams = parser.parse_args() model = LitMNIST(hparams) The line `self.hparams = hparams` is very special. This line assigns your hparams to the LightningModule. This does two things: -1. It adds them automatically to tensorboard logs under the hparams tab. +1. It adds them automatically to TensorBoard logs under the hparams tab. 2. Lightning will save those hparams to the checkpoint and use them to restore the module correctly. 
Trainer args @@ -165,9 +167,10 @@ Multiple Lightning Modules We often have multiple Lightning Modules where each one has different arguments. Instead of polluting the main.py file, the LightningModule lets you define arguments for each one. -.. code-block:: python +.. testcode:: + + class LitMNIST(LightningModule): - class LitMNIST(pl.LightningModule): def __init__(self, hparams): super().__init__() self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) @@ -178,7 +181,10 @@ polluting the main.py file, the LightningModule lets you define arguments for ea parser.add_argument('--layer_1_dim', type=int, default=128) return parser - class GoodGAN(pl.LightningModule): +.. testcode:: + + class GoodGAN(LightningModule): + def __init__(self, hparams): super().__init__() self.encoder = Encoder(layers=hparams.encoder_layers) @@ -189,7 +195,8 @@ polluting the main.py file, the LightningModule lets you define arguments for ea parser.add_argument('--encoder_layers', type=int, default=12) return parser -Now we can allow each model to inject the arguments it needs in the main.py + +Now we can allow each model to inject the arguments it needs in the ``main.py`` .. code-block:: python @@ -226,7 +233,7 @@ Now we can allow each model to inject the arguments it needs in the main.py # train main(args) -and now we can train MNIST or the gan using the command line interface! +and now we can train MNIST or the GAN using the command line interface! .. code-block:: bash diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index a7a406bbcb68d..5d26278483c39 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.trainer.trainer import Trainer + + Introduction Guide ================== PyTorch Lightning provides a very simple template for organizing your PyTorch code. 
Once @@ -126,14 +132,14 @@ The LightningModule provides the structure on how to organize these 5 ingredient Let's first start with the model. In this case we'll design a 3-layer neural network. -.. code-block:: default +.. testcode:: import torch from torch.nn import functional as F from torch import nn - import pytorch_lightning as pl + from pytorch_lightning.core.lightning import LightningModule - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def __init__(self): super().__init__() @@ -169,7 +175,7 @@ Notice this is a `LightningModule` instead of a `torch.nn.Module`. A LightningMo equivalent to a PyTorch Module except it has added functionality. However, you can use it EXACTLY the same as you would a PyTorch Module. -.. code-block:: default +.. testcode:: net = LitMNIST() x = torch.Tensor(1, 1, 28, 28) @@ -189,14 +195,14 @@ Data The Lightning Module organizes your dataloaders and data processing as well. Here's the PyTorch code for loading MNIST -.. code-block:: default +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE from torch.utils.data import DataLoader, random_split from torchvision.datasets import MNIST import os from torchvision import datasets, transforms - # transforms # prepare transforms standard to MNIST transform=transforms.Compose([transforms.ToTensor(), @@ -206,24 +212,38 @@ Here's the PyTorch code for loading MNIST mnist_train = MNIST(os.getcwd(), train=True, download=True) mnist_train = DataLoader(mnist_train, batch_size=64) +.. testoutput:: + :hide: + :skipif: os.path.isdir(os.path.join(os.getcwd(), 'MNIST')) or not TORCHVISION_AVAILABLE + + Downloading ... + Extracting ... + Downloading ... + Extracting ... + Downloading ... + Extracting ... + Processing... + Done! + When using PyTorch Lightning, we use the exact same code except we organize it into the LightningModule -.. code-block:: python +.. 
testcode:: + :skipif: not TORCHVISION_AVAILABLE from torch.utils.data import DataLoader, random_split from torchvision.datasets import MNIST import os from torchvision import datasets, transforms - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def train_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - return DataLoader(mnist_train, batch_size=64) + def train_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + return DataLoader(mnist_train, batch_size=64) Notice the code is exactly the same, except now the training dataloading has been organized by the LightningModule under the `train_dataloader` method. This is great because if you run into a project that uses Lightning and want @@ -232,21 +252,21 @@ to figure out how they prepare their training data you can just look in the `tra Usually though, we want to separate the things that write to disk in data-processing from things like transforms which happen in memory. -.. code-block:: python +.. 
testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def prepare_data(self): - # download only - MNIST(os.getcwd(), train=True, download=True) + def prepare_data(self): + # download only + MNIST(os.getcwd(), train=True, download=True) - def train_dataloader(self): - # no download, just transform - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - return DataLoader(mnist_train, batch_size=64) + def train_dataloader(self): + # no download, just transform + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + return DataLoader(mnist_train, batch_size=64) Doing it in the `prepare_data` method ensures that when you have multiple GPUs you won't overwrite the data. This is a contrived example @@ -254,24 +274,24 @@ but it gets more complicated with things like NLP or Imagenet. In general fill these methods with the following: -.. code-block:: python - - class LitMNIST(pl.LightningModule): +.. testcode:: - def prepare_data(self): - # stuff here is done once at the very beginning of training - # before any distributed training starts + class LitMNIST(LightningModule): - # download stuff - # save to disk - # etc... - - def train_dataloader(self): - # data transforms - # dataset creation - # return a DataLoader + def prepare_data(self): + # stuff here is done once at the very beginning of training + # before any distributed training starts + # download stuff + # save to disk + # etc... + ... + def train_dataloader(self): + # data transforms + # dataset creation + # return a DataLoader + ... Optimizer ^^^^^^^^^ @@ -287,20 +307,20 @@ In PyTorch we do it as follows: In Lightning we do the same but organize it under the configure_optimizers method. -.. code-block:: python +.. 
testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) + def configure_optimizers(self): + return Adam(self.parameters(), lr=1e-3) .. note:: The LightningModule itself has the parameters, so pass in self.parameters() However, if you have multiple optimizers use the matching parameters -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def configure_optimizers(self): return Adam(self.generator(), lr=1e-3), Adam(self.discriminator(), lr=1e-3) @@ -340,16 +360,16 @@ In the case of MNIST we do the following In Lightning, everything that is in the training step gets organized under the `training_step` function in the LightningModule -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'loss': loss} - # return loss (also works) + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'loss': loss} + # return loss (also works) Again, this is the same PyTorch code except that it has been organized by the LightningModule. This code is not restricted which means it can be as complicated as a full seq-2-seq, RL loop, GAN, etc... @@ -367,43 +387,43 @@ So far we defined 4 key ingredients in pure PyTorch but organized the code insid For clarity, we'll recall that the full LightningModule now looks like this. -.. code-block:: python +.. 
testcode:: + + class LitMNIST(LightningModule): + def __init__(self): + super().__init__() + self.layer_1 = torch.nn.Linear(28 * 28, 128) + self.layer_2 = torch.nn.Linear(128, 256) + self.layer_3 = torch.nn.Linear(256, 10) + + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x = torch.relu(x) + x = self.layer_2(x) + x = torch.relu(x) + x = self.layer_3(x) + x = torch.log_softmax(x, dim=1) + return x + + def train_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform) + return DataLoader(mnist_train, batch_size=64) - class LitMNIST(pl.LightningModule): - def __init__(self): - super().__init__() - self.layer_1 = torch.nn.Linear(28 * 28, 128) - self.layer_2 = torch.nn.Linear(128, 256) - self.layer_3 = torch.nn.Linear(256, 10) - - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x = torch.relu(x) - x = self.layer_2(x) - x = torch.relu(x) - x = self.layer_3(x) - x = torch.log_softmax(x, dim=1) - return x - - def train_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform) - return DataLoader(mnist_train, batch_size=64) - - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) + def configure_optimizers(self): + return Adam(self.parameters(), lr=1e-3) - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) - # add logging - logs = {'loss': loss} - return {'loss': loss, 'log': logs} + # add logging + logs = {'loss': loss} + return {'loss': loss, 'log': 
logs} Again, this is the same PyTorch code, except that it's organized by the LightningModule. This organization now lets us train this model @@ -551,33 +571,33 @@ will cause all sorts of issues. To solve this problem, move the download code to the `prepare_data` method in the LightningModule. In this method we do all the preparation we need to do once (instead of on every gpu). -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def prepare_data(self): - # transform - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + class LitMNIST(LightningModule): + def prepare_data(self): + # transform + transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - # download - mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform) - mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform) + # download + mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform) + mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform) - # train/val split - mnist_train, mnist_val = random_split(mnist_train, [55000, 5000]) + # train/val split + mnist_train, mnist_val = random_split(mnist_train, [55000, 5000]) - # assign to use in dataloaders - self.train_dataset = mnist_train - self.val_dataset = mnist_val - self.test_dataset = mnist_test + # assign to use in dataloaders + self.train_dataset = mnist_train + self.val_dataset = mnist_val + self.test_dataset = mnist_test - def train_dataloader(self): - return DataLoader(self.train_dataset, batch_size=64) + def train_dataloader(self): + return DataLoader(self.train_dataset, batch_size=64) - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=64) + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=64) - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=64) + def 
test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=64) The `prepare_data` method is also a good place to do any data processing that needs to be done only once (ie: download or tokenize, etc...). @@ -642,28 +662,28 @@ In addition, we define a `val_dataloader` method which tells the trainer what da Notice we split the train split of MNIST into train, validation. We also have to make sure to do the sample split in the `train_dataloader` method. -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'val_loss': loss} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} - - def val_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - _, mnist_val = random_split(mnist_train, [55000, 5000]) - mnist_val = DataLoader(mnist_val, batch_size=64) - return mnist_val + class LitMNIST(LightningModule): + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'val_loss': loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def val_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + _, mnist_val = random_split(mnist_train, [55000, 5000]) + mnist_val = DataLoader(mnist_val, batch_size=64) + return mnist_val Again, we've just 
organized the regular PyTorch code into two steps, the `validation_step` method which operates on a single batch and the `validation_epoch_end` method to compute statistics on all batches. @@ -698,26 +718,26 @@ Just like the validation loop, we define exactly the same steps for testing: - test_epoch_end - test_dataloader -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'val_loss': loss} - - def test_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} - - def test_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=False, download=False, transform=transform) - _, mnist_val = random_split(mnist_train, [55000, 5000]) - mnist_val = DataLoader(mnist_val, batch_size=64) - return mnist_val + class LitMNIST(LightningModule): + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'val_loss': loss} + + def test_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def test_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=False, download=False, transform=transform) + _, mnist_val = random_split(mnist_train, [55000, 5000]) + mnist_val = DataLoader(mnist_val, batch_size=64) + return mnist_val However, to make sure the test set isn't used inadvertently, Lightning has a separate API to run tests. Once you train your model simply call `.test()`. 
@@ -773,26 +793,26 @@ On the surface, it looks like `forward` and `training_step` are similar. General what we want the model to do is what happens in the `forward`. whereas the `training_step` likely calls forward from within it. -.. code-block:: python +.. testcode:: - class MNISTClassifier(pl.LightningModule): + class MNISTClassifier(LightningModule): - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x = torch.relu(x) - x = self.layer_2(x) - x = torch.relu(x) - x = self.layer_3(x) - x = torch.log_softmax(x, dim=1) - return x + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x = torch.relu(x) + x = self.layer_2(x) + x = torch.relu(x) + x = self.layer_3(x) + x = torch.log_softmax(x, dim=1) + return x - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return loss + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss .. code-block:: python @@ -802,27 +822,27 @@ within it. In this case, we've set this LightningModel to predict logits. But we could also have it predict feature maps: -.. code-block:: python +.. 
testcode:: - class MNISTRepresentator(pl.LightningModule): + class MNISTRepresentator(LightningModule): - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x1 = torch.relu(x) - x = self.layer_2(x1) - x2 = torch.relu(x) - x3 = self.layer_3(x2) - return [x, x1, x2, x3] - - def training_step(self, batch, batch_idx): - x, y = batch - out, l1_feats, l2_feats, l3_feats = self(x) - logits = torch.log_softmax(out, dim=1) - ce_loss = F.nll_loss(logits, y) - loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss - return loss + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x1 = torch.relu(x) + x = self.layer_2(x1) + x2 = torch.relu(x) + x3 = self.layer_3(x2) + return [x, x1, x2, x3] + + def training_step(self, batch, batch_idx): + x, y = batch + out, l1_feats, l2_feats, l3_feats = self(x) + logits = torch.log_softmax(out, dim=1) + ce_loss = F.nll_loss(logits, y) + loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss + return loss .. code-block:: python @@ -832,21 +852,21 @@ In this case, we've set this LightningModel to predict logits. But we could also Or maybe we have a model that we use to do generation -.. code-block:: python +.. testcode:: - class LitMNISTDreamer(pl.LightningModule): + class LitMNISTDreamer(LightningModule): - def forward(self, z): - imgs = self.decoder(z) - return imgs + def forward(self, z): + imgs = self.decoder(z) + return imgs - def training_step(self, batch, batch_idx): - x, y = batch - representation = self.encoder(x) - imgs = self(representation) + def training_step(self, batch, batch_idx): + x, y = batch + representation = self.encoder(x) + imgs = self(representation) - loss = perceptual_loss(imgs, x) - return loss + loss = perceptual_loss(imgs, x) + return loss .. code-block:: python @@ -871,7 +891,7 @@ Any part of the training, validation and testing loop can be modified. 
For instance, if you wanted to do your own backward pass, you would override the default implementation -.. code-block:: python +.. testcode:: def backward(self, use_amp, loss, optimizer): if use_amp: @@ -882,9 +902,9 @@ default implementation With your own -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def backward(self, use_amp, loss, optimizer): # do a custom way of backward @@ -892,7 +912,7 @@ With your own Or if you wanted to initialize ddp in a different way than the default one -.. code-block:: python +.. testcode:: def configure_ddp(self, model, device_ids): # Lightning DDP simply routes to test_step, val_step, etc... @@ -905,9 +925,9 @@ Or if you wanted to initialize ddp in a different way than the default one you could do your own: -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def configure_ddp(self, model, device_ids): @@ -916,7 +936,7 @@ you could do your own: return model Every single part of training is configurable this way. -For a full list look at `lightningModule `_. +For a full list look at `LightningModule `_. --------- @@ -925,26 +945,32 @@ Callbacks Another way to add arbitrary functionality is to add a custom callback for hooks that you might care about -.. code-block:: python +.. testcode:: - import pytorch_lightning as pl + from pytorch_lightning.callbacks import Callback - class MyPrintingCallback(pl.Callback): + class MyPrintingCallback(Callback): def on_init_start(self, trainer): print('Starting to init trainer!') def on_init_end(self, trainer): - print('trainer is init now') + print('Trainer is init now') def on_train_end(self, trainer, pl_module): print('do something when training ends') And pass the callbacks into the trainer -.. code-block:: python +.. testcode:: + + trainer = Trainer(callbacks=[MyPrintingCallback()]) + +.. 
testoutput:: + :hide: - Trainer(callbacks=[MyPrintingCallback()]) + Starting to init trainer! + Trainer is init now .. note:: See full list of 12+ hooks in the :ref:`callbacks`. diff --git a/docs/source/lr_finder.rst b/docs/source/lr_finder.rst index aab0c7548c4cf..3da5456b6de8b 100755 --- a/docs/source/lr_finder.rst +++ b/docs/source/lr_finder.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + Learning Rate Finder -------------------- @@ -24,17 +29,18 @@ will automatically be run before any training is done. The ``lr`` that is found and used will be written to the console and logged together with all other hyperparameters of the model. -.. code-block:: python +.. testcode:: # default, no automatic learning rate finder - Trainer(auto_lr_find=True) + trainer = Trainer(auto_lr_find=True) When the ``lr`` or ``learning_rate`` key in hparams exists, this flag sets your learning_rate. In both cases, if the respective fields are not found, an error will be thrown. -.. code-block:: python +.. testcode:: class LitModel(LightningModule): + def __init__(self, hparams): self.hparams = hparams @@ -43,14 +49,14 @@ In both cases, if the respective fields are not found, an error will be thrown. # finds learning rate automatically # sets hparams.lr or hparams.learning_rate to that learning rate - Trainer(auto_lr_find=True) + trainer = Trainer(auto_lr_find=True) To use an arbitrary value set it in the parameter. -.. code-block:: python +.. testcode:: # to set to your own hparams.my_value - Trainer(auto_lr_find='my_value') + trainer = Trainer(auto_lr_find='my_value') Under the hood, when you call fit, this is what happens. @@ -72,7 +78,7 @@ of this would look like .. 
code-block:: python model = MyModelClass(hparams) - trainer = pl.Trainer() + trainer = Trainer() # Run learning rate finder lr_finder = trainer.lr_find(model) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 55d9fdb5faac2..8688cd338bc1b 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + import torch + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + .. _multi-gpu-training: Multi-GPU training @@ -13,7 +19,7 @@ Delete .cuda() or .to() calls Delete any calls to .cuda() or .to(device). -.. code-block:: python +.. testcode:: # before lightning def forward(self, x): @@ -30,7 +36,7 @@ Init using type_as When you need to create a new tensor, use `type_as`. This will make your code scale to any arbitrary number of GPUs or TPUs with Lightning -.. code-block:: python +.. testcode:: # before lightning def forward(self, x): @@ -47,7 +53,7 @@ Remove samplers For multi-node or TPU training, in PyTorch we must use `torch.nn.DistributedSampler`. The sampler makes sure each GPU sees the appropriate part of your data. -.. code-block:: python +.. testcode:: # without lightning def train_dataloader(self): @@ -62,7 +68,7 @@ sampler makes sure each GPU sees the appropriate part of your data. With Lightning, you don't need to do this because it takes care of adding the correct samplers when needed. -.. code-block:: python +.. testcode:: # with lightning def train_dataloader(self): @@ -131,10 +137,11 @@ each GPU will process 16 samples, after which the root node will aggregate the r .. warning:: DP use is discouraged by PyTorch and Lightning. Use ddp which is more stable and at least 3x faster -.. code-block:: python +.. 
testcode:: + :skipif: torch.cuda.device_count() < 2 - # train on 1 GPU (using dp mode) - trainer = pl.Trainer(gpus=2, distributed_backend='dp') + # train on 2 GPUs (using dp mode) + trainer = Trainer(gpus=2, distributed_backend='dp') Distributed Data Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -157,10 +164,10 @@ Distributed Data Parallel .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp') + trainer = Trainer(gpus=8, distributed_backend='ddp') # train on 32 GPUs (4 nodes) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp', num_nodes=4) + trainer = Trainer(gpus=8, distributed_backend='ddp', num_nodes=4) Distributed Data Parallel 2 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -182,7 +189,7 @@ In this case, we can use ddp2 which behaves like dp in a machine and ddp across .. code-block:: python # train on 32 GPUs (4 nodes) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) + trainer = Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) Horovod ^^^^^^^ @@ -202,15 +209,15 @@ Horovod can be configured in the training script to run with any number of GPUs .. code-block:: python # train Horovod on GPU (number of GPUs / machines provided on command-line) - trainer = pl.Trainer(distributed_backend='horovod', gpus=1) + trainer = Trainer(distributed_backend='horovod', gpus=1) # train Horovod on CPU (number of processes / machines provided on command-line) - trainer = pl.Trainer(distributed_backend='horovod') + trainer = Trainer(distributed_backend='horovod') When starting the training job, the driver application will then be used to specify the total number of worker processes: -.. code-block:: +.. code-block:: bash # run training with 4 GPUs on a single machine horovodrun -np 4 python train.py @@ -226,7 +233,7 @@ DP/DDP2 caveats In DP and DDP2 each GPU within a machine sees a portion of a batch. DP and ddp2 roughly do the following: -.. code-block:: python +.. 
testcode:: def distributed_forward(batch, model): batch = torch.Tensor(32, 8) @@ -245,7 +252,7 @@ DP and ddp2 roughly do the following: So, when Lightning calls any of the `training_step`, `validation_step`, `test_step` you will only be operating on one of those pieces. -.. code-block:: python +.. testcode:: # the batch here is a portion of the FULL batch def training_step(self, batch, batch_idx): @@ -255,7 +262,7 @@ For most metrics, this doesn't really matter. However, if you want to add something to your computational graph (like softmax) using all batch parts you can use the `training_step_end` step. -.. code-block:: python +.. testcode:: def training_step_end(self, outputs): # only use when on dp @@ -288,7 +295,7 @@ In pseudocode, the full sequence is: to illustrate why this is needed, let's look at dataparallel -.. code-block:: python +.. testcode:: def training_step(self, batch, batch_idx): x, y = batch @@ -313,13 +320,13 @@ it will behave the same no matter the backend. Validation and test step also have the same option when using dp -.. code-block:: python +.. testcode:: - def validation_step_end(self, batch_parts_outputs): - ... + def validation_step_end(self, batch_parts_outputs): + ... - def test_step_end(self, batch_parts_outputs): - ... + def test_step_end(self, batch_parts_outputs): + ... Implement Your Own Distributed (DDP) training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -335,7 +342,7 @@ batch size. Let's say you have a batch size of 7 in your dataloader. -.. code-block:: +.. testcode:: class LitModel(LightningModule): @@ -344,7 +351,7 @@ Let's say you have a batch size of 7 in your dataloader. In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. -.. code-block:: +.. code-block:: python # effective batch size = 7 * 8 Trainer(gpus=8, distributed_backend='ddp|horovod') @@ -356,7 +363,7 @@ In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. In DDP2, your effective batch size will be 7 * num_nodes. 
The reason is that the full batch is visible to all GPUs on the node when using DDP2. -.. code-block:: +.. code-block:: python # effective batch size = 7 Trainer(gpus=8, distributed_backend='ddp2') diff --git a/docs/source/multiple_loaders.rst b/docs/source/multiple_loaders.rst index e88b7b1cbe078..dca339f9b99ad 100644 --- a/docs/source/multiple_loaders.rst +++ b/docs/source/multiple_loaders.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + Multiple Datasets ================= Lightning supports multiple dataloaders in a few ways. @@ -14,7 +18,7 @@ dataloaders). (`reference `_) -.. code-block:: python +.. testcode:: class ConcatDataset(torch.utils.data.Dataset): def __init__(self, *datasets): @@ -27,6 +31,7 @@ dataloaders). return min(len(d) for d in self.datasets) class LitModel(LightningModule): + def train_dataloader(self): concat_dataset = ConcatDataset( datasets.ImageFolder(traindir_A), @@ -44,9 +49,11 @@ dataloaders). def val_dataloader(self): # SAME + ... def test_dataloader(self): # SAME + ... Test/Val dataloaders -------------------- @@ -58,7 +65,7 @@ See the following for more details: - :meth:`~pytorch_lightning.core.LightningModule.val_dataloader` - :meth:`~pytorch_lightning.core.LightningModule.test_dataloader` -.. code-block:: python +.. testcode:: def val_dataloader(self): loader_1 = Dataloader() diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index e3f3a892d983f..24b11412e5c7d 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -1,3 +1,10 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.trainer.trainer import Trainer + + + Quick Start =========== @@ -13,7 +20,8 @@ To illustrate, here's the typical PyTorch project structure organized in a Light Step 1: Define a LightningModule --------------------------------- -.. code-block:: python +.. 
testcode:: + :skipif: not TORCHVISION_AVAILABLE import os @@ -22,10 +30,9 @@ Step 1: Define a LightningModule from torch.utils.data import DataLoader from torchvision.datasets import MNIST from torchvision import transforms + from pytorch_lightning.core.lightning import LightningModule - import pytorch_lightning as pl - - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self): super().__init__() @@ -53,7 +60,8 @@ Step 1: Define a LightningModule Step 2: Fit with a Trainer -------------------------- -.. code-block:: python +.. testcode:: + :skipif: torch.cuda.device_count() < 8 from pytorch_lightning import Trainer @@ -68,13 +76,13 @@ Under the hood, lightning does (in high-level pseudocode): .. code-block:: python model = LitModel() - train_dataloader = model.train_dataloader + train_dataloader = model.train_dataloader() optimizer = model.configure_optimizers() for epoch in epochs: train_outs = [] for batch in train_dataloader: - loss = model.training_step() + loss = model.training_step(batch) loss.backward() train_outs.append(loss.detach()) @@ -88,9 +96,9 @@ Validation loop --------------- To also add a validation loop add the following functions -.. code-block:: python +.. testcode:: - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def validation_step(self, batch, batch_idx): x, y = batch @@ -118,7 +126,11 @@ And now the trainer will call the validation loop automatically Under the hood in pseudocode, lightning does the following: -.. code-block:: python +.. testsetup:: * + + train_dataloader = [] + +.. testcode:: # ... for batch in train_dataloader: @@ -145,9 +157,9 @@ Test loop --------- You might also need a test loop -.. code-block:: python +.. 
testcode:: - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def test_step(self, batch, batch_idx): x, y = batch diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 0b02f9c56a729..8f8715a09e7b3 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -5,7 +5,7 @@ Learning rate scheduling ------------------------------------- Every optimizer you use can be paired with any `LearningRateScheduler `_. -.. code-block:: python +.. testcode:: # no LR scheduler def configure_optimizers(self): @@ -44,7 +44,7 @@ Use multiple optimizers (like GANs) ------------------------------------- To use multiple optimizers return > 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` -.. code-block:: python +.. testcode:: # one optimizer def configure_optimizers(self): @@ -79,7 +79,7 @@ override the :meth:`optimizer_step` function. For example, here step optimizer A every 2 batches and optimizer B every 4 batches -.. code-block:: python +.. testcode:: def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): optimizer.step() @@ -104,7 +104,7 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch Here we add a learning-rate warm up -.. code-block:: python +.. testcode:: # learning rate warm-up def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 63da7e7147af1..857fd08198de8 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from torch.utils.data import IterableDataset + from pytorch_lightning.trainer.trainer import Trainer + Sequential Data ================ Lightning has built in support for dealing with sequential data. @@ -10,9 +15,9 @@ When using PackedSequence, do 2 things: 1. 
return either a padded tensor in dataset or a list of variable length tensors in the dataloader collate_fn (example above shows the list implementation). 2. Pack the sequence in forward or training and validation steps depending on use case. -.. code-block:: python +.. testcode:: - # For use in dataloader + # For use in dataloader def collate_fn(batch): x = [item[0] for item in batch] y = [item[1] for item in batch] @@ -30,7 +35,7 @@ For example, it may save memory to use Truncated Backpropagation Through Time wh Lightning can handle TBTT automatically via this flag. -.. code-block:: python +.. testcode:: # DEFAULT (single backwards pass per batch) trainer = Trainer(truncated_bptt_steps=None) @@ -54,7 +59,7 @@ option when using sequential data. This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation interval when val_check_interval is less than one. -.. code-block:: python +.. testcode:: # IterableDataset class CustomDataset(IterableDataset): @@ -73,5 +78,7 @@ option when using sequential data. dataloader = DataLoader(dataset=iterable_dataset, batch_size=5) return dataloader +.. testcode:: + # Set val_check_interval - trainer = pl.Trainer() + trainer = Trainer(val_check_interval=100) diff --git a/docs/source/single_gpu.rst b/docs/source/single_gpu.rst index 73908489a720a..c6fa1b9af9bbc 100644 --- a/docs/source/single_gpu.rst +++ b/docs/source/single_gpu.rst @@ -1,9 +1,14 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Single GPU Training ==================== Make sure you are running on a machine that has at least one GPU. Lightning handles all the NVIDIA flags for you, there's no need to set them yourself. -.. code-block:: python +.. 
testcode:: + :skipif: torch.cuda.device_count() < 1 # train on 1 GPU (using dp mode) - trainer = pl.Trainer(gpus=1) \ No newline at end of file + trainer = Trainer(gpus=1) \ No newline at end of file diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index 2bac01b6f0418..ed09e7509b571 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -1,103 +1,107 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Computing cluster (SLURM) -========================== +========================= Lightning automates job the details behind training on a SLURM powered cluster. .. _multi-node: Multi-node training --------------------- +------------------- To train a model using multiple-nodes do the following: -1. Design your LightningModule. +1. Design your LightningModule. -2. Enable ddp in the trainer +2. Enable ddp in the trainer -.. code-block:: python + .. code-block:: python - # train on 32 GPUs across 4 nodes - trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') + # train on 32 GPUs across 4 nodes + trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') -3. It's a good idea to structure your train.py file like this: +3. It's a good idea to structure your train.py file like this: -.. code-block:: python + .. 
testcode:: - # train.py - def main(hparams): - model = LightningTemplateModel(hparams) + # train.py + def main(hparams): + model = LightningTemplateModel(hparams) - trainer = pl.Trainer( - gpus=8, - num_nodes=4, - distributed_backend='ddp' - ) + trainer = pl.Trainer( + gpus=8, + num_nodes=4, + distributed_backend='ddp' + ) - trainer.fit(model) + trainer.fit(model) - if __name__ == '__main__': - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - hyperparams = parser.parse_args() + if __name__ == '__main__': + root_dir = os.path.dirname(os.path.realpath(__file__)) + parent_parser = ArgumentParser(add_help=False) + hyperparams = parser.parse_args() - # TRAIN - main(hyperparams) + # TRAIN + main(hyperparams) -4. Create the appropriate SLURM job +4. Create the appropriate SLURM job -.. code-block:: bash + .. code-block:: bash - # (submit.sh) - #!/bin/bash -l + # (submit.sh) + #!/bin/bash -l - # SLURM SUBMIT SCRIPT - #SBATCH --nodes=4 - #SBATCH --gres=gpu:8 - #SBATCH --ntasks-per-node=8 - #SBATCH --mem=0 - #SBATCH --time=0-02:00:00 + # SLURM SUBMIT SCRIPT + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --ntasks-per-node=8 + #SBATCH --mem=0 + #SBATCH --time=0-02:00:00 - # activate conda env - source activate $1 + # activate conda env + source activate $1 - # ------------------------- - # debugging flags (optional) - export NCCL_DEBUG=INFO - export PYTHONFAULTHANDLER=1 + # ------------------------- + # debugging flags (optional) + export NCCL_DEBUG=INFO + export PYTHONFAULTHANDLER=1 - # on your cluster you might need these: - # set the network interface - # export NCCL_SOCKET_IFNAME=^docker0,lo + # on your cluster you might need these: + # set the network interface + # export NCCL_SOCKET_IFNAME=^docker0,lo - # might need the latest cuda - # module load NCCL/2.4.7-1-cuda.10.0 - # ------------------------- + # might need the latest cuda + # module load NCCL/2.4.7-1-cuda.10.0 + # ------------------------- - # run 
script from above - srun python3 train.py + # run script from above + srun python3 train.py -5. If you want auto-resubmit (read below), add this line to the submit.sh script +5. If you want auto-resubmit (read below), add this line to the submit.sh script -.. code-block:: bash + .. code-block:: bash - #SBATCH --signal=SIGUSR1@90 + #SBATCH --signal=SIGUSR1@90 -6. Submit the SLURM job +6. Submit the SLURM job -.. code-block:: bash + .. code-block:: bash - sbatch submit.sh + sbatch submit.sh .. note:: using :class:`~torch.utils.data.distributed.DistributedSampler` is already handled by Lightning. Walltime auto-resubmit ------------------------------------ +---------------------- When you use Lightning in a SLURM cluster, lightning automatically detects when it is about to run into the walltime, and it does the following: - 1. Saves a temporary checkpoint. - 2. Requeues the job. - 3. When the job starts, it loads the temporary checkpoint. +1. Saves a temporary checkpoint. +2. Requeues the job. +3. When the job starts, it loads the temporary checkpoint. To get this behavior make sure to add the correct signal to your SLURM script diff --git a/docs/source/test_set.rst b/docs/source/test_set.rst index 60a9f9a253cd0..7dfe40ddaa2da 100644 --- a/docs/source/test_set.rst +++ b/docs/source/test_set.rst @@ -1,10 +1,10 @@ Test set -========== +======== Lightning forces the user to run the test set separately to make sure it isn't evaluated by mistake Test after fit ----------------- +-------------- To run the test set after training completes, use this method .. code-block:: python @@ -15,10 +15,9 @@ To run the test set after training completes, use this method # run test set trainer.test() - Test pre-trained model ---------------------- -To run the test set on a pretrained model, use this method. +To run the test set on a pre-trained model, use this method. .. code-block:: python @@ -36,4 +35,4 @@ To run the test set on a pretrained model, use this method. 
trainer.test(model) In this case, the options you pass to trainer will be used when -running the test set (ie: 16-bit, dp, ddp, etc... \ No newline at end of file +running the test set (ie: 16-bit, dp, ddp, etc...) \ No newline at end of file diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index a034569d9ec42..e97d7837e0eb4 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Training Tricks ================ Lightning implements various tricks to help during training @@ -9,7 +14,7 @@ The effect is a large effective batch size of size KxN. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT (ie: no accumulated grads) trainer = Trainer(accumulate_grad_batches=1) @@ -22,7 +27,7 @@ norm `_ .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) diff --git a/docs/source/transfer_learning.rst b/docs/source/transfer_learning.rst index d5a9509f4a014..35b7d661f07c4 100644 --- a/docs/source/transfer_learning.rst +++ b/docs/source/transfer_learning.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + Transfer Learning ----------------- @@ -7,22 +11,22 @@ Using Pretrained Models Sometimes we want to use a LightningModule as a pretrained model. This is fine because a LightningModule is just a `torch.nn.Module`! -.. note:: Remember that a pl.LightningModule is EXACTLY a torch.nn.Module but with more capabilities. +.. note:: Remember that a LightningModule is EXACTLY a torch.nn.Module but with more capabilities. Let's use the `AutoEncoder` as a feature extractor in a separate model. -.. code-block:: python +.. testcode:: class Encoder(torch.nn.Module): ... 
- class AutoEncoder(pl.LightningModule): + class AutoEncoder(LightningModule): def __init__(self): self.encoder = Encoder() self.decoder = Decoder() - class CIFAR10Classifier(pl.LightingModule): + class CIFAR10Classifier(LightningModule): def __init__(self): # init the pretrained LightningModule self.feature_extractor = AutoEncoder.load_from_checkpoint(PATH) @@ -41,15 +45,16 @@ We used our pretrained Autoencoder (a LightningModule) for transfer learning! Example: Imagenet (computer Vision) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code-block:: python +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE import torchvision.models as models - class ImagenetTranferLearning(pl.LightingModule): + class ImagenetTransferLearning(LightningModule): def __init__(self): # init a pretrained resnet num_target_classes = 10 - self.feature_extractor = model.resnet50( + self.feature_extractor = models.resnet50( pretrained=True, num_classes=num_target_classes) self.feature_extractor.eval() @@ -66,7 +71,7 @@ Finetune .. code-block:: python - model = ImagenetTranferLearning() + model = ImagenetTransferLearning() trainer = Trainer() trainer.fit(model) @@ -74,7 +79,7 @@ And use it to predict your data of interest .. code-block:: python - model = ImagenetTranferLearning.load_from_checkpoint(PATH) + model = ImagenetTransferLearning.load_from_checkpoint(PATH) model.freeze() x = some_images_from_cifar10() @@ -90,26 +95,24 @@ as it is a `torch.nn.Module` subclass. Here's a model that uses `Huggingface transformers `_. -.. code-block:: python - - from transformers import BertModel +.. 
testcode:: - class BertMNLIFinetuner(pl.LightningModule): + class BertMNLIFinetuner(LightningModule): - def __init__(self): - super().__init__() + def __init__(self): + super().__init__() - self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True) - self.W = nn.Linear(bert.config.hidden_size, 3) - self.num_classes = 3 + self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True) + self.W = nn.Linear(bert.config.hidden_size, 3) + self.num_classes = 3 - def forward(self, input_ids, attention_mask, token_type_ids): + def forward(self, input_ids, attention_mask, token_type_ids): - h, _, attn = self.bert(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids) + h, _, attn = self.bert(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids) - h_cls = h[:, 0] - logits = self.W(h_cls) - return logits, attn \ No newline at end of file + h_cls = h[:, 0] + logits = self.W(h_cls) + return logits, attn \ No newline at end of file diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index 5f3e4389dd6d0..64a6950738ef1 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -1,3 +1,10 @@ +.. testsetup:: * + + import os + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + + Saving and loading weights ========================== @@ -22,13 +29,13 @@ Automatic saving Checkpointing is enabled by default to the current working directory. To change the checkpoint path pass in: -.. code-block:: python +.. testcode:: - Trainer(default_save_path='/your/path/to/save/checkpoints') + trainer = Trainer(default_save_path='/your/path/to/save/checkpoints') To modify the behavior of checkpointing pass in your own callback. -.. code-block:: python +.. 
testcode:: from pytorch_lightning.callbacks import ModelCheckpoint @@ -47,17 +54,16 @@ To modify the behavior of checkpointing pass in your own callback. Or disable it by passing -.. code-block:: python +.. testcode:: - trainer = Trainer(checkpoint_callback=False) + trainer = Trainer(checkpoint_callback=False) The Lightning checkpoint also saves the hparams (hyperparams) passed into the LightningModule init. .. note:: hparams is a `Namespace `_. -.. code-block:: python - :emphasize-lines: 8 +.. testcode:: from argparse import Namespace @@ -67,9 +73,9 @@ The Lightning checkpoint also saves the hparams (hyperparams) passed into the Li # define you module to have hparams as the first arg # this means your checkpoint will have everything that went into making # this model (in this case, learning rate) - class MyLightningModule(pl.LightningModule): + class MyLightningModule(LightningModule): - def __init__(self, hparams, ...): + def __init__(self, hparams, *args, **kwargs): self.hparams = hparams Manual saving @@ -78,7 +84,7 @@ You can manually save checkpoints and restore your model from the checkpointed s .. code-block:: python - model = MyModel(hparams) + model = MyLightningModule(hparams) trainer.fit(model) trainer.save_checkpoint("example.ckpt") new_model = MyModel.load_from_checkpoint(checkpoint_path="example.ckpt") @@ -96,9 +102,9 @@ To load a model along with its weights, biases and hyperparameters use following The above only works if you used `hparams` in your model definition -.. code-block:: python +.. testcode:: - class MyModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self, hparams): self.hparams = hparams @@ -106,9 +112,9 @@ The above only works if you used `hparams` in your model definition But if you don't and instead pass individual parameters -.. code-block:: python +.. 
testcode:: - class MyModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self, in_dim, out_dim): self.l1 = nn.Linear(in_dim, out_dim) @@ -117,7 +123,7 @@ you can restore the model like this .. code-block:: python - model = MyModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10) + model = LitModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10) Restoring Training State From 2a2f303ae91a4a17b1cd8127e5b811e38cb2d978 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 5 May 2020 18:31:15 +0200 Subject: [PATCH 32/43] Tests: refactor trainer dataloaders (#1690) * refactor default model * drop redundant seeds * refactor dataloaders tests * fix multiple * fix conf * flake8 * Apply suggestions from code review Co-authored-by: William Falcon Co-authored-by: William Falcon --- pytorch_lightning/trainer/data_loading.py | 20 +- tests/base/eval_model_test_dataloaders.py | 8 + tests/base/eval_model_test_epoch_ends.py | 36 +++ tests/base/eval_model_test_steps.py | 4 + tests/base/eval_model_train_dataloaders.py | 11 + tests/base/eval_model_utils.py | 22 ++ tests/base/eval_model_valid_dataloaders.py | 5 + tests/trainer/test_dataloaders.py | 301 +++++++-------------- 8 files changed, 191 insertions(+), 216 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index b3e15024c45de..00b37dfa86dff 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -46,8 +46,8 @@ def _has_len(dataloader: DataLoader) -> bool: try: # try getting the length if len(dataloader) == 0: - raise ValueError('Dataloader returned 0 length. Please make sure' - ' that your Dataloader atleast returns 1 batch') + raise ValueError('`Dataloader` returned 0 length.' 
+ ' Please make sure that your Dataloader at least returns 1 batch') return True except TypeError: return False @@ -186,10 +186,10 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset or when ' - 'DataLoader does not implement `__len__`) for `train_dataloader`, ' - '`Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies ' - 'checking validation every k training batches.') + 'When using an infinite DataLoader (e.g. with an IterableDataset' + ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' + ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' + ' checking validation every k training batches.') else: self._percent_range_check('val_check_interval') @@ -240,9 +240,9 @@ def _reset_eval_dataloader(self, model: LightningModule, mode: str) -> Tuple[int num_batches = int(num_batches * percent_check) elif percent_check not in (0.0, 1.0): raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset or when ' - f'DataLoader does not implement `__len__`) for `{mode}_dataloader`, ' - f'`Trainer({mode}_percent_check)` must be `0.0` or `1.0`.') + 'When using an infinite DataLoader (e.g. 
with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `{mode}_dataloader`,' + f' `Trainer({mode}_percent_check)` must be `0.0` or `1.0`.') return num_batches, dataloaders def reset_val_dataloader(self, model: LightningModule) -> None: @@ -252,7 +252,7 @@ def reset_val_dataloader(self, model: LightningModule) -> None: model: The current `LightningModule` """ if self.is_overriden('validation_step'): - self.num_val_batches, self.val_dataloaders =\ + self.num_val_batches, self.val_dataloaders = \ self._reset_eval_dataloader(model, 'val') def reset_test_dataloader(self, model) -> None: diff --git a/tests/base/eval_model_test_dataloaders.py b/tests/base/eval_model_test_dataloaders.py index 158b398545588..fdab56994ab9e 100644 --- a/tests/base/eval_model_test_dataloaders.py +++ b/tests/base/eval_model_test_dataloaders.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod +from tests.base.eval_model_utils import CustomInfDataloader + class TestDataloaderVariations(ABC): @@ -10,5 +12,11 @@ def dataloader(self, train: bool): def test_dataloader(self): return self.dataloader(train=False) + def test_dataloader__infinite(self): + return CustomInfDataloader(self.dataloader(train=False)) + def test_dataloader__empty(self): return None + + def test_dataloader__multiple(self): + return [self.dataloader(train=False), self.dataloader(train=False)] diff --git a/tests/base/eval_model_test_epoch_ends.py b/tests/base/eval_model_test_epoch_ends.py index 5279e6a9fcab9..fa3c3f7f4a90e 100644 --- a/tests/base/eval_model_test_epoch_ends.py +++ b/tests/base/eval_model_test_epoch_ends.py @@ -37,3 +37,39 @@ def test_epoch_end(self, outputs): metrics_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} result = {'progress_bar': metrics_dict, 'log': metrics_dict} return result + + def test_epoch_end__multiple_dataloaders(self, outputs): + """ + Called at the end of validation to aggregate outputs + :param outputs: list of individual outputs of 
each validation step + :return: + """ + # if returned a scalar from test_step, outputs is a list of tensor scalars + # we return just the average in this case (if we want) + # return torch.stack(outputs).mean() + test_loss_mean = 0 + test_acc_mean = 0 + i = 0 + for dl_output in outputs: + for output in dl_output: + test_loss = output['test_loss'] + + # reduce manually when using dp + if self.trainer.use_dp: + test_loss = torch.mean(test_loss) + test_loss_mean += test_loss + + # reduce manually when using dp + test_acc = output['test_acc'] + if self.trainer.use_dp: + test_acc = torch.mean(test_acc) + + test_acc_mean += test_acc + i += 1 + + test_loss_mean /= i + test_acc_mean /= i + + tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} + result = {'progress_bar': tqdm_dict} + return result diff --git a/tests/base/eval_model_test_steps.py b/tests/base/eval_model_test_steps.py index b4c80cff06421..bf57c2815bc89 100644 --- a/tests/base/eval_model_test_steps.py +++ b/tests/base/eval_model_test_steps.py @@ -8,6 +8,7 @@ class TestStepVariations(ABC): """ Houses all variations of test steps """ + def test_step(self, batch, batch_idx, *args, **kwargs): """ Default, baseline test_step @@ -87,3 +88,6 @@ def test_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kw f'test_acc_{dataloader_idx}': test_acc, }) return output + + def test_step__empty(self, batch, batch_idx, *args, **kwargs): + return {} diff --git a/tests/base/eval_model_train_dataloaders.py b/tests/base/eval_model_train_dataloaders.py index 3d547a83639b3..ded46de3d6e41 100644 --- a/tests/base/eval_model_train_dataloaders.py +++ b/tests/base/eval_model_train_dataloaders.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod +from tests.base.eval_model_utils import CustomInfDataloader + class TrainDataloaderVariations(ABC): @@ -9,3 +11,12 @@ def dataloader(self, train: bool): def train_dataloader(self): return self.dataloader(train=True) + + def 
train_dataloader__infinite(self): + return CustomInfDataloader(self.dataloader(train=True)) + + def train_dataloader__zero_length(self): + dataloader = self.dataloader(train=True) + dataloader.dataset.data = dataloader.dataset.data[:0] + dataloader.dataset.targets = dataloader.dataset.targets[:0] + return dataloader diff --git a/tests/base/eval_model_utils.py b/tests/base/eval_model_utils.py index e1a40f95b804f..d3eed3cb8dc5b 100644 --- a/tests/base/eval_model_utils.py +++ b/tests/base/eval_model_utils.py @@ -26,3 +26,25 @@ def get_output_metric(self, output, name): else: # if it is 2level deep -> per dataloader and per batch val = sum(out[name] for out in output) / len(output) return val + + +class CustomInfDataloader: + + def __init__(self, dataloader): + self.dataloader = dataloader + self.iter = iter(dataloader) + self.count = 0 + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count >= 50: + raise StopIteration + self.count = self.count + 1 + try: + return next(self.iter) + except StopIteration: + self.iter = iter(self.dataloader) + return next(self.iter) diff --git a/tests/base/eval_model_valid_dataloaders.py b/tests/base/eval_model_valid_dataloaders.py index 72b5afcceee0e..2b760e13086fd 100644 --- a/tests/base/eval_model_valid_dataloaders.py +++ b/tests/base/eval_model_valid_dataloaders.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod +from tests.base.eval_model_utils import CustomInfDataloader + class ValDataloaderVariations(ABC): @@ -13,3 +15,6 @@ def val_dataloader(self): def val_dataloader__multiple(self): return [self.dataloader(train=False), self.dataloader(train=False)] + + def val_dataloader__infinite(self): + return CustomInfDataloader(self.dataloader(train=False)) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index d0a6dd869a0ee..c249b834ef8db 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -8,22 +8,7 @@ import 
tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - TestModelBase, - LightningTestModel, - LightEmptyTestStep, - LightValidationMultipleDataloadersMixin, - LightTestMultipleDataloadersMixin, - LightTestFitSingleTestDataloadersMixin, - LightTestFitMultipleTestDataloadersMixin, - LightValStepFitMultipleDataloadersMixin, - LightValStepFitSingleDataloaderMixin, - LightTrainDataloader, - LightInfTrainDataloader, - LightInfValDataloader, - LightInfTestDataloader, - LightZeroLenDataloader -) +from tests.base import EvalModelTemplate @pytest.mark.parametrize("dataloader_options", [ @@ -34,14 +19,7 @@ ]) def test_dataloader_config_errors(tmpdir, dataloader_options): - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) # fit model trainer = Trainer( @@ -57,15 +35,9 @@ class CurrentTestModel( def test_multiple_val_dataloader(tmpdir): """Verify multiple val_dataloader.""" - class CurrentTestModel( - LightTrainDataloader, - LightValidationMultipleDataloadersMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.val_dataloader = model.val_dataloader__multiple + model.validation_step = model.validation_step__multiple_dataloaders # fit model trainer = Trainer( @@ -91,16 +63,9 @@ class CurrentTestModel( def test_multiple_test_dataloader(tmpdir): """Verify multiple test_dataloader.""" - class CurrentTestModel( - LightTrainDataloader, - LightTestMultipleDataloadersMixin, - LightEmptyTestStep, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.test_dataloader = 
model.test_dataloader__multiple + model.test_step = model.test_step__multiple_dataloaders # fit model trainer = Trainer( @@ -127,20 +92,16 @@ class CurrentTestModel( def test_train_dataloader_passed_to_fit(tmpdir): """Verify that train dataloader can be passed to fit """ - class CurrentTestModel(LightTrainDataloader, TestModelBase): - pass - - hparams = tutils.get_default_hparams() - # only train passed to fit - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - result = trainer.fit(model, train_dataloader=model._dataloader(train=True)) + fit_options = dict(train_dataloader=model.dataloader(train=True)) + result = trainer.fit(model, **fit_options) assert result == 1 @@ -148,26 +109,18 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): def test_train_val_dataloaders_passed_to_fit(tmpdir): """ Verify that train & val dataloader can be passed to fit """ - class CurrentTestModel( - LightTrainDataloader, - LightValStepFitSingleDataloaderMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - # train, val passed to fit - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) - result = trainer.fit(model, - train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) + fit_options = dict(train_dataloader=model.dataloader(train=True), + val_dataloaders=model.dataloader(train=False)) + + result = trainer.fit(model, **fit_options) assert result == 1 assert len(trainer.val_dataloaders) == 1, \ f'`val_dataloaders` not initiated properly, got {trainer.val_dataloaders}' @@ -176,31 +129,21 @@ class CurrentTestModel( def test_all_dataloaders_passed_to_fit(tmpdir): """Verify train, val & test dataloader(s) can be passed 
to fit and test method""" - class CurrentTestModel( - LightTrainDataloader, - LightValStepFitSingleDataloaderMixin, - LightTestFitSingleTestDataloadersMixin, - LightEmptyTestStep, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() + model = EvalModelTemplate(tutils.get_default_hparams()) # train, val and test passed to fit - model = CurrentTestModel(hparams) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) + fit_options = dict(train_dataloader=model.dataloader(train=True), + val_dataloaders=model.dataloader(train=False)) + test_options = dict(test_dataloaders=model.dataloader(train=False)) - result = trainer.fit(model, - train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) - - trainer.test(test_dataloaders=model._dataloader(train=False)) + result = trainer.fit(model, **fit_options) + trainer.test(**test_options) assert result == 1 assert len(trainer.val_dataloaders) == 1, \ @@ -212,32 +155,25 @@ class CurrentTestModel( def test_multiple_dataloaders_passed_to_fit(tmpdir): """Verify that multiple val & test dataloaders can be passed to fit.""" - class CurrentTestModel( - LightningTestModel, - LightValStepFitMultipleDataloadersMixin, - LightTestFitMultipleTestDataloadersMixin, - ): - pass - - hparams = tutils.get_default_hparams() + model = EvalModelTemplate(tutils.get_default_hparams()) + model.validation_step = model.validation_step__multiple_dataloaders + model.test_step = model.test_step__multiple_dataloaders # train, multiple val and multiple test passed to fit - model = CurrentTestModel(hparams) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) + fit_options = dict(train_dataloader=model.dataloader(train=True), + val_dataloaders=[model.dataloader(train=False), + model.dataloader(train=False)]) + test_options = dict(test_dataloaders=[model.dataloader(train=False), + 
model.dataloader(train=False)]) - results = trainer.fit( - model, - train_dataloader=model._dataloader(train=True), - val_dataloaders=[model._dataloader(train=False), model._dataloader(train=False)], - ) - assert results - - trainer.test(test_dataloaders=[model._dataloader(train=False), model._dataloader(train=False)]) + trainer.fit(model, **fit_options) + trainer.test(**test_options) assert len(trainer.val_dataloaders) == 2, \ f'Multiple `val_dataloaders` not initiated properly, got {trainer.val_dataloaders}' @@ -248,16 +184,7 @@ class CurrentTestModel( def test_mixing_of_dataloader_options(tmpdir): """Verify that dataloaders can be passed to fit""" - class CurrentTestModel( - LightTrainDataloader, - LightValStepFitSingleDataloaderMixin, - LightTestFitSingleTestDataloadersMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) trainer_options = dict( default_root_dir=tmpdir, @@ -268,17 +195,14 @@ class CurrentTestModel( # fit model trainer = Trainer(**trainer_options) - fit_options = dict(val_dataloaders=model._dataloader(train=False)) - results = trainer.fit(model, **fit_options) + results = trainer.fit(model, val_dataloaders=model.dataloader(train=False)) assert results # fit model trainer = Trainer(**trainer_options) - fit_options = dict(val_dataloaders=model._dataloader(train=False)) - test_options = dict(test_dataloaders=model._dataloader(train=False)) - - _ = trainer.fit(model, **fit_options) - trainer.test(**test_options) + results = trainer.fit(model, val_dataloaders=model.dataloader(train=False)) + assert results + trainer.test(test_dataloaders=model.dataloader(train=False)) assert len(trainer.val_dataloaders) == 1, \ f'`val_dataloaders` not initiated properly, got {trainer.val_dataloaders}' @@ -286,72 +210,68 @@ class CurrentTestModel( f'`test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' -def 
test_inf_train_dataloader(tmpdir): +def test_train_inf_dataloader_error(tmpdir): """Test inf train data loader (e.g. IterableDataset)""" + model = EvalModelTemplate(tutils.get_default_hparams()) + model.train_dataloader = model.train_dataloader__infinite - class CurrentTestModel( - LightInfTrainDataloader, - LightningTestModel - ): - pass + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + trainer.fit(model) - # fit model - with pytest.raises(MisconfigurationException): - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - val_check_interval=0.5 - ) + +def test_val_inf_dataloader_error(tmpdir): + """Test inf train data loader (e.g. IterableDataset)""" + model = EvalModelTemplate(tutils.get_default_hparams()) + model.val_dataloader = model.val_dataloader__infinite + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.5) + + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - val_check_interval=50 - ) - result = trainer.fit(model) - # verify training completed - assert result == 1 +def test_test_inf_dataloader_error(tmpdir): + """Test inf train data loader (e.g. IterableDataset)""" + model = EvalModelTemplate(tutils.get_default_hparams()) + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, test_percent_check=0.5) + + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + trainer.test(model) + + +@pytest.mark.parametrize('check_interval', [50, 1.0]) +def test_inf_train_dataloader(tmpdir, check_interval): + """Test inf train data loader (e.g. 
IterableDataset)""" + + model = EvalModelTemplate(tutils.get_default_hparams()) + model.train_dataloader = model.train_dataloader__infinite trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1 + max_epochs=1, + train_check_interval=check_interval, ) result = trainer.fit(model) - # verify training completed assert result == 1 -def test_inf_val_dataloader(tmpdir): +@pytest.mark.parametrize('check_interval', [1.0]) +def test_inf_val_dataloader(tmpdir, check_interval): """Test inf val data loader (e.g. IterableDataset)""" - class CurrentTestModel( - LightInfValDataloader, - LightningTestModel - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) - - # fit model - with pytest.raises(MisconfigurationException): - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - val_percent_check=0.5 - ) - trainer.fit(model) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.val_dataloader = model.val_dataloader__infinite # logger file to get meta trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1 + max_epochs=1, + val_check_interval=check_interval, ) result = trainer.fit(model) @@ -359,35 +279,20 @@ class CurrentTestModel( assert result == 1 -def test_inf_test_dataloader(tmpdir): +@pytest.mark.parametrize('check_interval', [50, 1.0]) +def test_inf_test_dataloader(tmpdir, check_interval): """Test inf test data loader (e.g. 
IterableDataset)""" - class CurrentTestModel( - LightInfTestDataloader, - LightningTestModel, - LightTestFitSingleTestDataloadersMixin - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) - - # fit model - with pytest.raises(MisconfigurationException): - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - test_percent_check=0.5 - ) - trainer.test(model) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.test_dataloader = model.test_dataloader__infinite # logger file to get meta trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1 + max_epochs=1, + test_check_interval=check_interval, ) result = trainer.fit(model) - trainer.test(model) # verify training completed assert result == 1 @@ -396,14 +301,8 @@ class CurrentTestModel( def test_error_on_zero_len_dataloader(tmpdir): """ Test that error is raised if a zero-length dataloader is defined """ - class CurrentTestModel( - LightZeroLenDataloader, - LightningTestModel - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) + model = EvalModelTemplate(tutils.get_default_hparams()) + model.train_dataloader = model.train_dataloader__zero_length # fit model with pytest.raises(ValueError): @@ -419,29 +318,22 @@ class CurrentTestModel( def test_warning_with_few_workers(tmpdir): """ Test that error is raised if dataloader with only a few workers is used """ - class CurrentTestModel( - LightTrainDataloader, - LightValStepFitSingleDataloaderMixin, - LightTestFitSingleTestDataloadersMixin, - LightEmptyTestStep, - TestModelBase, - ): - pass - - hparams = tutils.get_default_hparams() - model = CurrentTestModel(hparams) - - fit_options = dict(train_dataloader=model._dataloader(train=True), - val_dataloaders=model._dataloader(train=False)) - test_options = dict(test_dataloaders=model._dataloader(train=False)) + model = EvalModelTemplate(tutils.get_default_hparams()) - trainer = Trainer( + # logger file to get meta + 
trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, val_percent_check=0.1, train_percent_check=0.2 ) + fit_options = dict(train_dataloader=model.dataloader(train=True), + val_dataloaders=model.dataloader(train=False)) + test_options = dict(test_dataloaders=model.dataloader(train=False)) + + trainer = Trainer(**trainer_options) + # fit model with pytest.warns(UserWarning, match='train'): trainer.fit(model, **fit_options) @@ -491,10 +383,7 @@ def test_batch_size_smaller_than_num_gpus(): num_gpus = 3 batch_size = 3 - class CurrentTestModel( - LightTrainDataloader, - TestModelBase, - ): + class CurrentTestModel(EvalModelTemplate): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From d6a0375974da1cbdf44c1a6c4ca4ee89d9f3496b Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 6 May 2020 02:07:26 +0800 Subject: [PATCH 33/43] Fixing logic (#1734) --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 37bac3d99727f..b2ce8599bc9a0 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -325,7 +325,7 @@ def train(self): if self.reload_dataloaders_every_epoch: self.reset_train_dataloader(model) # set seed for distributed sampler (enables shuffling for each epoch) - if self.use_ddp or self.use_horovod \ + if (self.use_ddp or self.use_horovod) \ and hasattr(self.train_dataloader.sampler, 'set_epoch'): self.train_dataloader.sampler.set_epoch(epoch) From 2b03d34931c5d4b2acfdf3156877dfca1dd8001a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 5 May 2020 20:08:15 +0200 Subject: [PATCH 34/43] complete test (#1705) --- tests/trainer/test_dataloaders.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c249b834ef8db..92704a9040a9e 100644 
--- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -387,8 +387,17 @@ class CurrentTestModel(EvalModelTemplate): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # batch norm doesn't work with batch size 1, we replace it self.c_d1_bn = torch.nn.ReLU() + def training_step(self, *args, **kwargs): + output = super().training_step(*args, **kwargs) + loss = output['loss'] + # we make sure to add some metrics to the output dict, + # this is essential for this test + output['progress_bar'] = {'train_loss': loss} + return output + def train_dataloader(self): dataloader = super().train_dataloader() # construct a dataset with a size that is not divisible by num_gpus @@ -408,6 +417,7 @@ def train_dataloader(self): trainer = Trainer( max_epochs=1, + val_percent_check=0, gpus=num_gpus, ) From fc7f5919b5ab487ad399b4ed13b49ceb2043cdd0 Mon Sep 17 00:00:00 2001 From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com> Date: Tue, 5 May 2020 14:08:54 -0400 Subject: [PATCH 35/43] improve pickle tests for callbacks (#1717) * improve pickle tests for callbacks * set mode dict as a class attr --- pytorch_lightning/callbacks/early_stopping.py | 28 +++++++++---------- tests/callbacks/test_callbacks.py | 10 +++++-- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index d383a2fb42d7d..100c317172044 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -45,11 +45,14 @@ class EarlyStopping(Callback): >>> early_stopping = EarlyStopping('val_loss') >>> trainer = Trainer(early_stop_callback=early_stopping) """ + mode_dict = { + 'min': torch.lt, + 'max': torch.gt, + } def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3, verbose: bool = False, mode: str = 'auto', strict: bool = True): super().__init__() - self.monitor = monitor 
self.patience = patience self.verbose = verbose @@ -59,17 +62,19 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: self.stopped_epoch = 0 self.mode = mode - mode_dict = { - 'min': torch.lt, - 'max': torch.gt, - 'auto': torch.gt if 'acc' in self.monitor else torch.lt - } - - if mode not in mode_dict: + if mode not in self.mode_dict: if self.verbose > 0: log.info(f'EarlyStopping mode {mode} is unknown, fallback to auto mode.') self.mode = 'auto' + if self.mode == 'auto': + if self.monitor == 'acc': + self.mode = 'max' + else: + self.mode = 'min' + if self.verbose > 0: + log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.') + self.min_delta *= 1 if self.monitor_op == torch.gt else -1 def _validate_condition_metric(self, logs): @@ -96,12 +101,7 @@ def _validate_condition_metric(self, logs): @property def monitor_op(self): - mode_dict = { - 'min': torch.lt, - 'max': torch.gt, - 'auto': torch.gt if 'acc' in self.monitor else torch.lt - } - return mode_dict[self.mode] + return self.mode_dict[self.mode] def on_train_start(self, trainer, pl_module): # Allow instances to be re-used diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 32fdcdaeb51a1..8a50cb667c4bf 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -229,8 +229,14 @@ def test_pickling(tmpdir): early_stopping = EarlyStopping() ckpt = ModelCheckpoint(tmpdir) - pickle.dumps(ckpt) - pickle.dumps(early_stopping) + early_stopping_pickled = pickle.dumps(early_stopping) + ckpt_pickled = pickle.dumps(ckpt) + + early_stopping_loaded = pickle.loads(early_stopping_pickled) + ckpt_loaded = pickle.loads(ckpt_pickled) + + assert vars(early_stopping) == vars(early_stopping_loaded) + assert vars(ckpt) == vars(ckpt_loaded) @pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2]) From 35bbe178bd3a0c60331641e1698dc3940419c533 Mon Sep 17 00:00:00 2001 From: Yuri Brovman Date: Tue, 5 May 2020 14:09:48 
-0400 Subject: [PATCH 36/43] fix _reset_eval_dataloader() for IterableDataset (#1560) * removed if dl from _reset_eval_dataloader() * changed to if dl != None to be more safe * hints from pep8speaks Co-authored-by: ybrovman --- pytorch_lightning/trainer/data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 00b37dfa86dff..52e53acd5a2b4 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -219,7 +219,7 @@ def _reset_eval_dataloader(self, model: LightningModule, mode: str) -> Tuple[int ' this off for validation and test dataloaders.') # add samplers - dataloaders = [self.auto_add_sampler(dl, train=False) for dl in dataloaders if dl] + dataloaders = [self.auto_add_sampler(dl, train=False) for dl in dataloaders if dl is not None] num_batches = 0 From 0cb58fbb4cd38142636a52f34fdb948ab45b7043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 5 May 2020 22:48:45 +0200 Subject: [PATCH 37/43] Mock packages for RTD docs build (follow up to doctests) (#1739) * mock all packages on RTD * update --- docs/source/conf.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index f6dad2c3922ea..7ca48bd19cbf3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,8 @@ builtins.__LIGHTNING_SETUP__ = True +IS_READTHEDOCS_BUILD = os.environ.get('READTHEDOCS', False) + import pytorch_lightning # noqa: E402 # -- Project documents ------------------------------------------------------- @@ -305,23 +307,31 @@ def setup(app): path_ipynb2 = os.path.join(path_nbs, os.path.basename(path_ipynb)) shutil.copy(path_ipynb, path_ipynb2) + # Ignoring Third-party packages # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule +def package_list_from_file(file): + 
mocked_packages = [] + with open(file, 'r') as fp: + for ln in fp.readlines(): + found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] + pkg = ln[:min(found)] if found else ln + if pkg.rstrip(): + mocked_packages.append(pkg.rstrip()) + return mocked_packages + + +MOCK_PACKAGES = package_list_from_file(os.path.join(PATH_ROOT, 'requirements-extra.txt')) +if IS_READTHEDOCS_BUILD: + # mock also base packages when we are on RTD since we don't install them there + base_packages = package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) + MOCK_PACKAGES.extend(base_packages) -MOCK_REQUIRE_PACKAGES = [] -with open(os.path.join(PATH_ROOT, 'requirements-extra.txt'), 'r') as fp: - for ln in fp.readlines(): - found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] - pkg = ln[:min(found)] if found else ln - if pkg.rstrip(): - MOCK_REQUIRE_PACKAGES.append(pkg.rstrip()) - -# TODO: better parse from package since the import name and package name may differ MOCK_MANUAL_PACKAGES = [ 'torchvision', 'PIL', ] -autodoc_mock_imports = MOCK_REQUIRE_PACKAGES + MOCK_MANUAL_PACKAGES +autodoc_mock_imports = MOCK_PACKAGES + MOCK_MANUAL_PACKAGES # Options for the linkcode extension From 851866333c467ea0bd5e828d6e47610f90eff9a4 Mon Sep 17 00:00:00 2001 From: Peter Yu <2057325+yukw777@users.noreply.github.com> Date: Wed, 6 May 2020 12:38:32 -0400 Subject: [PATCH 38/43] Attach version_ to checkpoint path only if version is int (#1748) --- CHANGELOG.md | 4 +++- pytorch_lightning/trainer/callback_config.py | 4 +++- tests/callbacks/test_callbacks.py | 24 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bda687d2bbfca..f3190f9d7353d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). ### Changed - + - Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) - Updated LightningTemplateModel to look more like Colab example ([#1577](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) @@ -38,6 +38,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676)) +- Fixed a bug in Trainer that prepended the checkpoint path with `version_` when it shouldn't ([#1748](https://github.com/PyTorchLightning/pytorch-lightning/pull/1748)) + ## [0.7.5] - 2020-04-27 ### Changed diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py index 39c67963169e4..a760b9760209c 100644 --- a/pytorch_lightning/trainer/callback_config.py +++ b/pytorch_lightning/trainer/callback_config.py @@ -49,10 +49,12 @@ def configure_checkpoint_callback(self): if self.weights_save_path is not None: save_dir = self.weights_save_path + version = self.logger.version if isinstance( + self.logger.version, str) else f'version_{self.logger.version}' ckpt_path = os.path.join( save_dir, self.logger.name, - f'version_{self.logger.version}', + version, "checkpoints" ) else: diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 8a50cb667c4bf..884fc82e13e28 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -3,7 +3,9 @@ from pytorch_lightning import Callback from pytorch_lightning import Trainer, LightningModule from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint +from 
pytorch_lightning.loggers import TensorBoardLogger from tests.base import EvalModelTemplate +from pathlib import Path def test_trainer_callback_system(tmpdir): @@ -258,6 +260,28 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k): assert trainer.ckpt_path != trainer.default_root_dir +@pytest.mark.parametrize( + 'logger_version,expected', + [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')], +) +def test_model_checkpoint_path(tmpdir, logger_version, expected): + """Test that "version_" prefix is only added when logger's version is an integer""" + tutils.reset_seed() + model = EvalModelTemplate(tutils.get_default_hparams()) + logger = TensorBoardLogger(str(tmpdir), version=logger_version) + + trainer = Trainer( + default_root_dir=tmpdir, + overfit_pct=0.2, + max_epochs=5, + logger=logger + ) + trainer.fit(model) + + ckpt_version = Path(trainer.ckpt_path).parent.name + assert ckpt_version == expected + + def test_lr_logger_single_lr(tmpdir): """ Test that learning rates are extracted and logged for single lr scheduler""" tutils.reset_seed() From b9364f96b1165bf79340f4cf3690b9da84d7001c Mon Sep 17 00:00:00 2001 From: Pavel Grunt Date: Wed, 6 May 2020 18:39:22 +0200 Subject: [PATCH 39/43] lr_finder: Fix typo in docstring (#1746) --- pytorch_lightning/trainer/lr_finder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py index e664fd3cc47d0..5e8c7a862a65e 100755 --- a/pytorch_lightning/trainer/lr_finder.py +++ b/pytorch_lightning/trainer/lr_finder.py @@ -94,7 +94,7 @@ def lr_find(self, # Inspect results fig = lr_finder.plot(); fig.show() - suggested_lr = lr_finder.suggest() + suggested_lr = lr_finder.suggestion() # Overwrite lr and create new model hparams.lr = suggested_lr From f656882942c521154c77607e6ab02b4585893b89 Mon Sep 17 00:00:00 2001 From: Shunta Komatsu <59395084+skmatz@users.noreply.github.com> Date: Thu, 7 May 2020 22:25:54 +0900 
Subject: [PATCH 40/43] Fix typo (#1750) --- .github/workflows/docs-check.yml | 2 +- docs/source/lr_finder.rst | 2 +- .../domain_templates/reinforce_learn_Qnet.py | 2 +- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/loggers/neptune.py | 2 +- pytorch_lightning/trainer/callback_config.py | 2 +- pytorch_lightning/trainer/data_loading.py | 6 ++-- pytorch_lightning/trainer/evaluation_loop.py | 16 +++++------ pytorch_lightning/trainer/lr_finder.py | 2 +- pytorch_lightning/trainer/model_hooks.py | 8 +++--- pytorch_lightning/trainer/optimizers.py | 2 +- pytorch_lightning/trainer/trainer.py | 28 +++++++++---------- pytorch_lightning/trainer/training_loop.py | 18 ++++++------ setup.py | 4 +-- tests/models/test_gpu.py | 4 +-- tests/trainer/test_checks.py | 12 ++++---- 16 files changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index cd49a01e8e569..bc8717a2f0f48 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -11,7 +11,7 @@ jobs: - uses: actions/checkout@v2 - uses: ammaraskar/sphinx-action@master with: - # git is requried to clone the docs theme + # git is required to clone the docs theme pre-build-command: "apt-get update -y && apt-get install -y git" docs-folder: "docs/" repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/docs/source/lr_finder.rst b/docs/source/lr_finder.rst index 3da5456b6de8b..c20472bfc075c 100755 --- a/docs/source/lr_finder.rst +++ b/docs/source/lr_finder.rst @@ -15,7 +15,7 @@ To reduce the amount of guesswork concerning choosing a good initial learning rate, a `learning rate finder` can be used. As described in this `paper `_ a learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result of -this is a `lr` vs. `loss` plot that can be used as guidence for choosing a optimal +this is a `lr` vs. 
`loss` plot that can be used as guidance for choosing a optimal initial lr. .. warning:: For the moment, this feature only works with models having a single optimizer. diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 4301957afa58d..ff3f634da7817 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -257,7 +257,7 @@ def dqn_mse_loss(self, batch: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], nb_batch) -> OrderedDict: """ Carries out a single step through the environment to update the replay buffer. - Then calculates loss based on the minibatch recieved + Then calculates loss based on the minibatch received Args: batch: current mini batch of replay data diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index a534929434a8b..1b906c0f9089a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -875,7 +875,7 @@ def configure_ddp(self, model, device_ids): def _init_slurm_connection(self) -> None: """ - Sets up environemnt variables necessary for pytorch distributed communications + Sets up environment variables necessary for pytorch distributed communications based on slurm environment. """ # use slurm job id for the port number diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 374b513b618e7..df1cfeb78f05b 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -31,7 +31,7 @@ class NeptuneLogger(LightningLoggerBase): pip install neptune-client The Neptune logger can be used in the online mode or offline (silent) mode. - To log experiment data in online mode, :class:`NeptuneLogger` requries an API key. + To log experiment data in online mode, :class:`NeptuneLogger` requires an API key. 
In offline mode, Neptune will log to a local directory. **ONLINE MODE** diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py index a760b9760209c..551d085eb444d 100644 --- a/pytorch_lightning/trainer/callback_config.py +++ b/pytorch_lightning/trainer/callback_config.py @@ -61,7 +61,7 @@ def configure_checkpoint_callback(self): ckpt_path = os.path.join(self.default_root_dir, "checkpoints") # when no val step is defined, use 'loss' otherwise 'val_loss' - train_step_only = not self.is_overriden('validation_step') + train_step_only = not self.is_overridden('validation_step') monitor_key = 'loss' if train_step_only else 'val_loss' if self.checkpoint_callback is True: diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 52e53acd5a2b4..0428f4951e596 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -78,7 +78,7 @@ class TrainerDataLoadingMixin(ABC): replace_sampler_ddp: bool @abstractmethod - def is_overriden(self, *args): + def is_overridden(self, *args): """Warning: this is just empty shell for code implemented in other class.""" def _percent_range_check(self, name: str) -> None: @@ -251,7 +251,7 @@ def reset_val_dataloader(self, model: LightningModule) -> None: Args: model: The current `LightningModule` """ - if self.is_overriden('validation_step'): + if self.is_overridden('validation_step'): self.num_val_batches, self.val_dataloaders = \ self._reset_eval_dataloader(model, 'val') @@ -261,7 +261,7 @@ def reset_test_dataloader(self, model) -> None: Args: model: The current `LightningModule` """ - if self.is_overriden('test_step'): + if self.is_overridden('test_step'): self.num_test_batches, self.test_dataloaders =\ self._reset_eval_dataloader(model, 'test') diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 0320bf35419ea..b80066f40ff7d 100644 --- 
a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -195,7 +195,7 @@ def get_model(self): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod - def is_overriden(self, *args): + def is_overridden(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod @@ -279,13 +279,13 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ # on dp / ddp2 might still want to do something with the batch parts if test_mode: - if self.is_overriden('test_step_end'): + if self.is_overridden('test_step_end'): model_ref = self.get_model() with self.profiler.profile('test_step_end'): output = model_ref.test_step_end(output) self.on_test_batch_end() else: - if self.is_overriden('validation_step_end'): + if self.is_overridden('validation_step_end'): model_ref = self.get_model() with self.profiler.profile('validation_step_end'): output = model_ref.validation_step_end(output) @@ -307,23 +307,23 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ model = model.module if test_mode: - if self.is_overriden('test_end', model=model): + if self.is_overridden('test_end', model=model): # TODO: remove in v1.0.0 eval_results = model.test_end(outputs) rank_zero_warn('Method `test_end` was deprecated in v0.7 and will be removed v1.0.' ' Use `test_epoch_end` instead.', DeprecationWarning) - elif self.is_overriden('test_epoch_end', model=model): + elif self.is_overridden('test_epoch_end', model=model): eval_results = model.test_epoch_end(outputs) else: - if self.is_overriden('validation_end', model=model): + if self.is_overridden('validation_end', model=model): # TODO: remove in v1.0.0 eval_results = model.validation_end(outputs) rank_zero_warn('Method `validation_end` was deprecated in v0.7 and will be removed v1.0.' 
' Use `validation_epoch_end` instead.', DeprecationWarning) - elif self.is_overriden('validation_epoch_end', model=model): + elif self.is_overridden('validation_epoch_end', model=model): eval_results = model.validation_epoch_end(outputs) # enable train mode again @@ -336,7 +336,7 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ def run_evaluation(self, test_mode: bool = False): # when testing make sure user defined a test step - if test_mode and not self.is_overriden('test_step'): + if test_mode and not self.is_overridden('test_step'): raise MisconfigurationException( "You called `.test()` without defining model's `.test_step()`." " Please define and try again") diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py index 5e8c7a862a65e..89b1c886a1db9 100755 --- a/pytorch_lightning/trainer/lr_finder.py +++ b/pytorch_lightning/trainer/lr_finder.py @@ -214,7 +214,7 @@ class _LRFinder(object): lr_min: lr to start search from - lr_max: lr to stop seach + lr_max: lr to stop search num_training: number of steps to take between lr_min and lr_max diff --git a/pytorch_lightning/trainer/model_hooks.py b/pytorch_lightning/trainer/model_hooks.py index fa9dfed7da02f..701119d736be8 100644 --- a/pytorch_lightning/trainer/model_hooks.py +++ b/pytorch_lightning/trainer/model_hooks.py @@ -11,7 +11,7 @@ def is_function_implemented(self, f_name): f_op = getattr(model, f_name, None) return callable(f_op) - def is_overriden(self, method_name: str, model: LightningModule = None) -> bool: + def is_overridden(self, method_name: str, model: LightningModule = None) -> bool: if model is None: model = self.get_model() super_object = LightningModule @@ -30,10 +30,10 @@ def is_overriden(self, method_name: str, model: LightningModule = None) -> bool: # cannot pickle __code__ so cannot verify if PatchDataloader # exists which shows dataloader methods have been overwritten. 
# so, we hack it by using the string representation - is_overriden = instance_attr.patch_loader_code != str(super_attr.__code__) + is_overridden = instance_attr.patch_loader_code != str(super_attr.__code__) else: - is_overriden = instance_attr.__code__ is not super_attr.__code__ - return is_overriden + is_overridden = instance_attr.__code__ is not super_attr.__code__ + return is_overridden def has_arg(self, f_name, arg_name): model = self.get_model() diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 8dd77f7971a48..ccabcddba72ac 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -81,7 +81,7 @@ def init_optimizers( ' * multiple outputs, dictionaries as described with an optional `frequency` key (int)') def configure_schedulers(self, schedulers: list): - # Convert each scheduler into dict sturcture with relevant information + # Convert each scheduler into dict structure with relevant information lr_schedulers = [] default_config = {'interval': 'epoch', # default every epoch 'frequency': 1, # default every epoch/batch diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0353fae2bff7f..56df5ad3a9abf 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -193,7 +193,7 @@ def __init__( show_progress_bar: .. warning:: .. deprecated:: 0.7.2 - Set `progress_bar_refresh_rate` to postive integer to enable. Will remove 0.9.0. + Set `progress_bar_refresh_rate` to positive integer to enable. Will remove 0.9.0. progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar. Ignored when a custom callback is passed to :paramref:`~Trainer.callbacks`. 
@@ -893,7 +893,7 @@ def run_pretrain_routine(self, model: LightningModule): return # check if we should run validation during training - self.disable_validation = not (self.is_overriden('validation_step') and self.val_percent_check > 0) \ + self.disable_validation = not (self.is_overridden('validation_step') and self.val_percent_check > 0) \ and not self.fast_dev_run # run tiny validation (if validation defined) @@ -994,45 +994,45 @@ def check_model_configuration(self, model: LightningModule): """ # Check training_step, train_dataloader, configure_optimizer methods - if not self.is_overriden('training_step', model): + if not self.is_overridden('training_step', model): raise MisconfigurationException( 'No `training_step()` method defined. Lightning `Trainer` expects as minimum a' ' `training_step()`, `training_dataloader()` and `configure_optimizers()` to be defined.') - if not self.is_overriden('train_dataloader', model): + if not self.is_overridden('train_dataloader', model): raise MisconfigurationException( 'No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a' ' `training_step()`, `training_dataloader()` and `configure_optimizers()` to be defined.') - if not self.is_overriden('configure_optimizers', model): + if not self.is_overridden('configure_optimizers', model): raise MisconfigurationException( 'No `configure_optimizers()` method defined. 
Lightning `Trainer` expects as minimum a' ' `training_step()`, `training_dataloader()` and `configure_optimizers()` to be defined.') # Check val_dataloader, validation_step and validation_epoch_end - if self.is_overriden('val_dataloader', model): - if not self.is_overriden('validation_step', model): + if self.is_overridden('val_dataloader', model): + if not self.is_overridden('validation_step', model): raise MisconfigurationException('You have passed in a `val_dataloader()`' ' but have not defined `validation_step()`.') else: - if not self.is_overriden('validation_epoch_end', model): + if not self.is_overridden('validation_epoch_end', model): rank_zero_warn( 'You have defined a `val_dataloader()` and have defined a `validation_step()`,' ' you may also want to define `validation_epoch_end()` for accumulating stats.', RuntimeWarning ) else: - if self.is_overriden('validation_step', model): + if self.is_overridden('validation_step', model): raise MisconfigurationException('You have defined `validation_step()`,' ' but have not passed in a val_dataloader().') # Check test_dataloader, test_step and test_epoch_end - if self.is_overriden('test_dataloader', model): - if not self.is_overriden('test_step', model): + if self.is_overridden('test_dataloader', model): + if not self.is_overridden('test_step', model): raise MisconfigurationException('You have passed in a `test_dataloader()`' ' but have not defined `test_step()`.') else: - if not self.is_overriden('test_epoch_end', model): + if not self.is_overridden('test_epoch_end', model): rank_zero_warn( 'You have defined a `test_dataloader()` and have defined a `test_step()`, you may also want to' ' define `test_epoch_end()` for accumulating stats.', RuntimeWarning @@ -1040,8 +1040,8 @@ def check_model_configuration(self, model: LightningModule): def check_testing_model_configuration(self, model: LightningModule): - has_test_step = self.is_overriden('test_step', model) - has_test_epoch_end = self.is_overriden('test_epoch_end', 
model) + has_test_step = self.is_overridden('test_step', model) + has_test_epoch_end = self.is_overridden('test_epoch_end', model) gave_test_loader = hasattr(model, 'test_dataloader') and model.test_dataloader() if gave_test_loader and not has_test_step: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index b2ce8599bc9a0..073403c9b4582 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -271,7 +271,7 @@ def detect_nan_tensors(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod - def is_overriden(self, *args): + def is_overridden(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod @@ -419,9 +419,9 @@ def run_training_epoch(self): _outputs = self.run_training_batch(batch, batch_idx) batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs - # only track outputs when user implementes training_epoch_end - # otherwise we will build up unecessary memory - if self.is_overriden('training_epoch_end', model=self.get_model()): + # only track outputs when user implements training_epoch_end + # otherwise we will build up unnecessary memory + if self.is_overridden('training_epoch_end', model=self.get_model()): outputs.append(batch_output) # when returning -1 from train_step, we end epoch early @@ -484,7 +484,7 @@ def run_training_epoch(self): # process epoch outputs model = self.get_model() - if self.is_overriden('training_epoch_end', model=model): + if self.is_overridden('training_epoch_end', model=model): epoch_output = model.training_epoch_end(outputs) _processed_outputs = self.process_output(epoch_output) log_epoch_metrics = _processed_outputs[2] @@ -493,7 +493,7 @@ def run_training_epoch(self): self.callback_metrics.update(callback_epoch_metrics) # when no val loop is present or fast-dev-run still need to call checkpoints - if not 
self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val): + if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val): self.call_checkpoint_callback() self.call_early_stop_callback() @@ -539,7 +539,7 @@ def run_training_batch(self, batch, batch_idx): self.split_idx = split_idx for opt_idx, optimizer in self._get_optimizers_iterable(): - # make sure only the gradients of the current optimizer's paramaters are calculated + # make sure only the gradients of the current optimizer's parameters are calculated # in the training step to prevent dangling gradients in multiple-optimizer setup. if len(self.optimizers) > 1: for param in self.get_model().parameters(): @@ -737,14 +737,14 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): # allow any mode to define training_step_end # do something will all the dp outputs (like softmax) - if self.is_overriden('training_step_end'): + if self.is_overridden('training_step_end'): model_ref = self.get_model() with self.profiler.profile('training_step_end'): output = model_ref.training_step_end(output) # allow any mode to define training_end # TODO: remove in 1.0.0 - if self.is_overriden('training_end'): + if self.is_overridden('training_end'): model_ref = self.get_model() with self.profiler.profile('training_end'): output = model_ref.training_end(output) diff --git a/setup.py b/setup.py index dfcaf29a5990e..3f77570dee5a6 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def load_requirements(path_dir=PATH_ROOT, comment_char='#'): return reqs -def load_long_describtion(): +def load_long_description(): # https://github.com/PyTorchLightning/pytorch-lightning/raw/master/docs/source/_images/lightning_module/pt_to_pl.png url = os.path.join(pytorch_lightning.__homepage__, 'raw', pytorch_lightning.__version__, 'docs') text = open('README.md', encoding='utf-8').read() @@ -59,7 +59,7 @@ def load_long_describtion(): license=pytorch_lightning.__license__, 
packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks']), - long_description=load_long_describtion(), + long_description=load_long_description(), long_description_content_type='text/markdown', include_package_data=True, zip_safe=False, diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 5bdb603e14518..49d8b658f77c8 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -273,13 +273,13 @@ def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): @pytest.mark.gpus_param_tests @pytest.mark.parametrize("gpus", [[1, 2, 19], -1, '-1']) -def test_parse_gpu_fail_on_non_existant_id(mocked_device_count_0, gpus): +def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): with pytest.raises(MisconfigurationException): parse_gpu_ids(gpus) @pytest.mark.gpus_param_tests -def test_parse_gpu_fail_on_non_existant_id_2(mocked_device_count): +def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): with pytest.raises(MisconfigurationException): parse_gpu_ids([1, 2, 19]) diff --git a/tests/trainer/test_checks.py b/tests/trainer/test_checks.py index 45155d67e65d7..7dbbaa9245caf 100755 --- a/tests/trainer/test_checks.py +++ b/tests/trainer/test_checks.py @@ -41,10 +41,10 @@ def test_wrong_configure_optimizers(tmpdir): def test_wrong_validation_settings(tmpdir): """ Test the following cases related to validation configuration of model: - * error if `val_dataloader()` is overriden but `validation_step()` is not - * if both `val_dataloader()` and `validation_step()` is overriden, + * error if `val_dataloader()` is overridden but `validation_step()` is not + * if both `val_dataloader()` and `validation_step()` is overridden, throw warning if `val_epoch_end()` is not defined - * error if `validation_step()` is overriden but `val_dataloader()` is not + * error if `validation_step()` is overridden but `val_dataloader()` is not """ tutils.reset_seed() hparams = tutils.get_default_hparams() @@ -71,10 +71,10 @@ 
def test_wrong_validation_settings(tmpdir): def test_wrong_test_settigs(tmpdir): """ Test the following cases related to test configuration of model: - * error if `test_dataloader()` is overriden but `test_step()` is not - * if both `test_dataloader()` and `test_step()` is overriden, + * error if `test_dataloader()` is overridden but `test_step()` is not + * if both `test_dataloader()` and `test_step()` is overridden, throw warning if `test_epoch_end()` is not defined - * error if `test_step()` is overriden but `test_dataloader()` is not + * error if `test_step()` is overridden but `test_dataloader()` is not """ hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) From 3a642601e84c3abf1f1b438f9acc932a1f150f7f Mon Sep 17 00:00:00 2001 From: Yuri Brovman Date: Thu, 7 May 2020 09:26:41 -0400 Subject: [PATCH 41/43] added warning for None dataloader (#1745) * added warning for None dataloader * fixed variable style * updated warning message * remove unused import Co-authored-by: ybrovman --- pytorch_lightning/trainer/data_loading.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 0428f4951e596..3557e2c83a761 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -218,6 +218,9 @@ def _reset_eval_dataloader(self, model: LightningModule, mode: str) -> Tuple[int f'Your {mode}_dataloader has shuffle=True, it is best practice to turn' ' this off for validation and test dataloaders.') + if any([dl is None for dl in dataloaders]): + rank_zero_warn("One of given dataloaders is None and it will be skipped.") + # add samplers dataloaders = [self.auto_add_sampler(dl, train=False) for dl in dataloaders if dl is not None] From 25bbd059df68abc1b0ffa77ad2480af183d61b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 9 May 2020 05:31:56 +0200 Subject: [PATCH 42/43] Also 
update progress_bar in training_epoch_end (#1724) * update prog. bar metrics on train epoch end * changelog * wip test * more thorough testing * comments * update docs * move test Co-authored-by: Jirka --- CHANGELOG.md | 2 + pytorch_lightning/core/lightning.py | 5 ++- pytorch_lightning/trainer/training_loop.py | 1 + tests/models/test_module_hooks.py | 46 ++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/models/test_module_hooks.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f3190f9d7353d..46c2ec88fe52b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). +- The progress bar metrics now also get updated in `training_epoch_end` ([#1724](https://github.com/PyTorchLightning/pytorch-lightning/pull/1724)). + ### Changed - Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 1b906c0f9089a..8638a512a50f8 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -257,6 +257,7 @@ def training_epoch_end( May contain the following optional keys: - log (metrics to be added to the logger; only tensors) + - progress_bar (dict for progress bar display) - any metric used in a callback (e.g. early stopping). 
Note: @@ -280,7 +281,8 @@ def training_epoch_end(self, outputs): # log training accuracy at the end of an epoch results = { - 'log': {'train_acc': train_acc_mean.item()} + 'log': {'train_acc': train_acc_mean.item()}, + 'progress_bar': {'train_acc': train_acc_mean}, } return results @@ -303,6 +305,7 @@ def training_epoch_end(self, outputs): # log training accuracy at the end of an epoch results = { 'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch} + 'progress_bar': {'train_acc': train_acc_mean}, } return results """ diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 073403c9b4582..2eb033c000f27 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -491,6 +491,7 @@ def run_training_epoch(self): callback_epoch_metrics = _processed_outputs[3] self.log_metrics(log_epoch_metrics, {}) self.callback_metrics.update(callback_epoch_metrics) + self.add_progress_bar_metrics(_processed_outputs[1]) # when no val loop is present or fast-dev-run still need to call checkpoints if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val): diff --git a/tests/models/test_module_hooks.py b/tests/models/test_module_hooks.py new file mode 100644 index 0000000000000..c2f5056bd171f --- /dev/null +++ b/tests/models/test_module_hooks.py @@ -0,0 +1,46 @@ +import torch + +from pytorch_lightning import Trainer +from tests.base import EvalModelTemplate + +import tests.base.utils as tutils + + +def test_training_epoch_end_metrics_collection(tmpdir): + """ Test that progress bar metrics also get collected at the end of an epoch. 
""" + num_epochs = 3 + class CurrentModel(EvalModelTemplate): + + def training_step(self, *args, **kwargs): + output = super().training_step(*args, **kwargs) + output['progress_bar'].update({'step_metric': torch.tensor(-1)}) + output['progress_bar'].update({'shared_metric': 100}) + return output + + def training_epoch_end(self, outputs): + epoch = self.current_epoch + # both scalar tensors and Python numbers are accepted + return { + 'progress_bar': { + f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch + 'shared_metric': 111, + } + } + + model = CurrentModel(tutils.get_default_hparams()) + trainer = Trainer( + max_epochs=num_epochs, + default_root_dir=tmpdir, + overfit_pct=0.1, + ) + result = trainer.fit(model) + assert result == 1 + metrics = trainer.progress_bar_dict + + # metrics added in training step should be unchanged by epoch end method + assert metrics['step_metric'] == -1 + # a metric shared in both methods gets overwritten by epoch_end + assert metrics['shared_metric'] == 111 + # metrics are kept after each epoch + for i in range(num_epochs): + assert metrics[f'epoch_metric_{i}'] == i From bba5061d945a0a74469bae155f58781412403659 Mon Sep 17 00:00:00 2001 From: Lars Gohr Date: Tue, 28 Mar 2023 13:48:48 +0200 Subject: [PATCH 43/43] Updated elgohr/Publish-Docker-Github-Action to a supported version (v5) --- .github/workflows/docker_builds.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker_builds.yml b/.github/workflows/docker_builds.yml index 736ff72460d74..06c5011dcb428 100644 --- a/.github/workflows/docker_builds.yml +++ b/.github/workflows/docker_builds.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v2 - name: Publish Releases to Docker # only on releases - uses: elgohr/Publish-Docker-Github-Action@2.14 + uses: elgohr/Publish-Docker-Github-Action@v5 if: contains(github.ref, 'refs/tags/') && !contains(${{ steps.get_version.outputs.VERSION }}, 'rc') %% !contains(${{ 
steps.get_version.outputs.VERSION }}, 'dev') with: name: pytorchlightning/pytorch_lightning @@ -33,7 +33,7 @@ jobs: tags: "${{ steps.get_version.outputs.VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},stable-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - name: Publish Master # publish master - uses: elgohr/Publish-Docker-Github-Action@2.14 + uses: elgohr/Publish-Docker-Github-Action@v5 if: github.event_name == 'push' with: name: pytorchlightning/pytorch_lightning