
Commit e068af9

Ampt (#2572)
* remove grad scaling tpu
1 parent c197b74 commit e068af9

File tree (6 files changed, +15 -10 lines)

  pytorch_lightning/core/memory.py
  pytorch_lightning/trainer/distrib_parts.py
  pytorch_lightning/trainer/evaluation_loop.py
  pytorch_lightning/trainer/trainer.py
  pytorch_lightning/trainer/training_io.py
  pytorch_lightning/trainer/training_loop.py

pytorch_lightning/core/memory.py

Lines changed: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ def _forward_example_input(self) -> None:
         input_ = model.example_input_array
         input_ = model.transfer_batch_to_device(input_, model.device)

-        if trainer is not None and trainer.use_amp:
+        if trainer is not None and trainer.use_amp and not trainer.use_tpu:
             if NATIVE_AMP_AVALAIBLE:
                 model.forward = torch.cuda.amp.autocast()(model.forward)
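
Every hunk in this commit applies the same guard: torch.cuda.amp is a CUDA-only API, so autocast wrapping and grad scaling have to be skipped when the trainer targets TPU. A minimal sketch of the pattern, with a stand-in for Lightning's NATIVE_AMP_AVALAIBLE flag and a bare trainer/model pair (the helper name is illustrative, not part of the diff):

import torch

# Rough stand-in for Lightning's NATIVE_AMP_AVALAIBLE flag (kept with the
# project's original spelling); native AMP ships with torch.cuda.amp in PyTorch 1.6+.
NATIVE_AMP_AVALAIBLE = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast")


def wrap_forward_for_amp(model, trainer):
    """Illustrative helper: wrap ``model.forward`` in autocast only when it is safe."""
    # On TPU there is no CUDA autocast, so the forward pass is left untouched;
    # this mirrors the `and not trainer.use_tpu` guard added above.
    if trainer is not None and trainer.use_amp and not trainer.use_tpu:
        if NATIVE_AMP_AVALAIBLE:
            model.forward = torch.cuda.amp.autocast()(model.forward)
    return model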

pytorch_lightning/trainer/distrib_parts.py

Lines changed: 2 additions & 2 deletions
@@ -240,14 +240,14 @@ def dp_train(self, model):

         # hack forward to do autocast for the user
         model_autocast_original_forward = model.forward
-        if self.use_amp and NATIVE_AMP_AVALAIBLE:
+        if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
             # wrap the user's forward in autocast and give it back at the end
             model.forward = torch.cuda.amp.autocast()(model.forward)

         # TODO: remove with dropping NVIDIA AMP support
         # check for this bug (amp + dp + !01 doesn't work)
         # https://github.com/NVIDIA/apex/issues/227
-        if self.use_dp and self.use_amp and not NATIVE_AMP_AVALAIBLE:
+        if self.use_dp and self.use_amp and not NATIVE_AMP_AVALAIBLE and not self.use_tpu:
             if self.amp_level == 'O2':
                 raise MisconfigurationException(
                     f'Amp level {self.amp_level} with DataParallel is not supported.'
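
torch.cuda.amp.autocast() works both as a context manager and as a decorator, which is why the DP path above can wrap the user's forward and hand the original back later via model_autocast_original_forward. A small standalone sketch of that wrap-and-restore dance (TinyModel and the use_amp/use_tpu flags are assumptions for illustration, not Trainer attributes):

import torch


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.layer(x)


model = TinyModel()
use_amp, use_tpu = torch.cuda.is_available(), False

# Keep a handle to the original forward so it can be given back at the end,
# just like `model_autocast_original_forward` in the diff above.
original_forward = model.forward

if use_amp and not use_tpu:
    # autocast() returns an object that can also decorate a callable.
    model.forward = torch.cuda.amp.autocast()(model.forward)

out = model.forward(torch.randn(3, 4))  # runs under autocast when CUDA AMP is active

# Undo the wrapping once (data-parallel) training is done.
model.forward = original_forward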

pytorch_lightning/trainer/evaluation_loop.py

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ def _evaluate(
                 # -----------------
                 # RUN EVALUATION STEP
                 # -----------------
-                if self.use_amp and NATIVE_AMP_AVALAIBLE:
+                if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
                     with torch.cuda.amp.autocast():
                         output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode)
                 else:

pytorch_lightning/trainer/trainer.py

Lines changed: 6 additions & 1 deletion
@@ -1118,7 +1118,7 @@ def run_pretrain_routine(self, model: LightningModule):
         self.copy_trainer_model_properties(ref_model)

         # init amp. Must be done here instead of __init__ to allow ddp to work
-        if NATIVE_AMP_AVALAIBLE and self.precision == 16:
+        if NATIVE_AMP_AVALAIBLE and self.precision == 16 and not self.use_tpu:
             self.scaler = torch.cuda.amp.GradScaler()

         # log hyper-parameters
@@ -1300,6 +1300,11 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders):
         if ckpt_path == 'best':
             ckpt_path = self.checkpoint_callback.best_model_path

+        if len(ckpt_path) == 0:
+            rank_zero_warn(f'.test() found no path for the best weights, {ckpt_path}. Please '
+                           f'specify a path for a checkpoint .test(ckpt_path=PATH)')
+            return {}
+
         ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
         model.load_state_dict(ckpt['state_dict'])
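
Beyond skipping GradScaler creation on TPU, the second hunk makes .test(ckpt_path='best') fail soft: when no best checkpoint was ever written (for example, checkpointing was disabled), best_model_path is an empty string and torch.load('') would raise, so the trainer now warns and returns an empty dict instead. A hedged sketch of that guard outside the Trainer class, with warnings.warn standing in for Lightning's rank_zero_warn (the function name is illustrative):

import warnings

import torch


def load_best_checkpoint(best_model_path: str):
    """Illustrative version of the guard added to __test_using_best_weights."""
    if len(best_model_path) == 0:
        # Nothing was ever checkpointed, so there is nothing to test against.
        warnings.warn(
            ".test() found no path for the best weights. "
            "Please specify a checkpoint via .test(ckpt_path=PATH)"
        )
        return {}
    # map_location keeps the load device-agnostic, matching the diff above.
    return torch.load(best_model_path, map_location=lambda storage, loc: storage)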

pytorch_lightning/trainer/training_io.py

Lines changed: 1 addition & 1 deletion
@@ -358,7 +358,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         checkpoint['lr_schedulers'] = lr_schedulers

         # save native amp scaling
-        if self.use_amp and NATIVE_AMP_AVALAIBLE:
+        if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
             checkpoint['native_amp_scaling_state'] = self.scaler.state_dict()

         # add the module_arguments and state_dict from the model
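
The checkpoint dict only receives a 'native_amp_scaling_state' entry when a CUDA GradScaler actually exists, so TPU checkpoints neither carry nor expect scaler state. A rough sketch of the symmetric save/restore, assuming a scaler created as in the trainer.py hunk above (both helper names are illustrative):

def dump_scaler_state(checkpoint: dict, scaler, use_amp: bool, use_tpu: bool) -> dict:
    # Only CUDA native AMP has a GradScaler; TPU runs skip this entirely.
    if use_amp and not use_tpu and scaler is not None:
        checkpoint['native_amp_scaling_state'] = scaler.state_dict()
    return checkpoint


def restore_scaler_state(checkpoint: dict, scaler) -> None:
    # Inverse of the dump above; tolerate checkpoints written without AMP.
    if scaler is not None and 'native_amp_scaling_state' in checkpoint:
        scaler.load_state_dict(checkpoint['native_amp_scaling_state'])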

pytorch_lightning/trainer/training_loop.py

Lines changed: 4 additions & 4 deletions
@@ -702,7 +702,7 @@ def run_batch_backward_pass(self, split_batch, batch_idx, opt_idx, optimizer):
         # ------------------
         # CLIP GRADS
         # ------------------
-        if self.use_amp and NATIVE_AMP_AVALAIBLE:
+        if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
             self.scaler.unscale_(optimizer)
         self.clip_gradients()

@@ -750,7 +750,7 @@ def call_optimizer_step(self, optimizer, opt_idx, batch_idx, split_batch):
                                  using_native_amp=native_amp)

         # in native 16-bit we need to update scaler after optimizer step
-        if self.use_amp and NATIVE_AMP_AVALAIBLE:
+        if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
             self.scaler.update()

         # model hook
@@ -767,7 +767,7 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
             # FORWARD
             # ---------------------------
             with self.profiler.profile('model_forward'):
-                if self.use_amp and NATIVE_AMP_AVALAIBLE:
+                if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
                     with torch.cuda.amp.autocast():
                         training_step_output = self.training_forward(split_batch, batch_idx,
                                                                      opt_idx, hiddens)
@@ -817,7 +817,7 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
             model_ref.backward(self, closure_loss, optimizer, opt_idx)

         # exit amp context
-        if self.precision == 16 and not NATIVE_AMP_AVALAIBLE:
+        if self.precision == 16 and not NATIVE_AMP_AVALAIBLE and not self.on_tpu:
             a, b, c = None, None, None
             error = context.__exit__(a, b, c)
             if error:
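
Taken together, the training_loop hunks guard every stage of the native AMP step: autocast the forward, unscale before gradient clipping, and update the scaler after the optimizer step, none of which applies on TPU. A condensed, plain-PyTorch sketch of that sequence under the same guard (the function and its arguments are illustrative, not the Trainer's actual hooks):

import torch


def run_amp_training_step(model, batch, optimizer, scaler, use_tpu=False):
    """Condensed illustration of the guarded native AMP flow in training_loop.py."""
    # Same condition as the diffs: native AMP available, and not running on TPU.
    native_amp = hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast')
    use_native_amp = native_amp and not use_tpu

    optimizer.zero_grad()

    if use_native_amp:
        # FORWARD under autocast, backward on the scaled loss.
        with torch.cuda.amp.autocast():
            loss = model(batch).mean()
        scaler.scale(loss).backward()
        # CLIP GRADS: unscale first so the threshold applies to the real gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        # The scale factor is updated only after the optimizer step.
        scaler.update()
    else:
        # TPU / CPU / Apex paths: plain forward, backward, clip, step.
        loss = model(batch).mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return loss.detach()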
