
Commit 4fad71c

Training optimizations (#217)
* Optimizations to the training model: based on the changes made in textual_inversion, I carried over the relevant changes that improve model training. These changes reduce the amount of memory used, significantly improve the speed at which training runs, and improve the quality of the results. They also fix the problem where the model trainer wouldn't automatically stop when it hit the set number of steps.
* Update main.py: cleaned up whitespace.
1 parent d126db2 commit 4fad71c

File tree: 2 files changed (+26, −7 lines)


configs/stable-diffusion/v1-finetune.yaml

Lines changed: 5 additions & 2 deletions
```diff
@@ -52,7 +52,7 @@ model:
         ddconfig:
           double_z: true
           z_channels: 4
-          resolution: 256
+          resolution: 512
           in_channels: 3
           out_ch: 3
           ch: 128
@@ -73,7 +73,7 @@ model:
 data:
   target: main.DataModuleFromConfig
   params:
-    batch_size: 2
+    batch_size: 1
     num_workers: 16
     wrap: false
     train:
@@ -92,6 +92,9 @@ data:
         repeats: 10

 lightning:
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 500
   callbacks:
     image_logger:
       target: main.ImageLogger
```
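The new `lightning.modelcheckpoint` block is passed through to PyTorch Lightning's `ModelCheckpoint` callback when `main.py` builds it, so a checkpoint is written every 500 training steps. A minimal sketch of the callback this override corresponds to (the directory path is illustrative, not part of this commit):

```python
from pytorch_lightning.callbacks import ModelCheckpoint

# Sketch only: save a checkpoint every 500 optimizer steps,
# matching every_n_train_steps: 500 from the YAML override above.
checkpoint_callback = ModelCheckpoint(
    dirpath='logs/example-run/checkpoints',  # illustrative path
    every_n_train_steps=500,
)
```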

main.py

Lines changed: 21 additions & 5 deletions
```diff
@@ -171,8 +171,8 @@ def str2bool(v):
         help='Initialize embedding manager from a checkpoint',
     )
     parser.add_argument(
-        '--placeholder_tokens', type=str, nargs='+', default=['*']
-    )
+        '--placeholder_tokens', type=str, nargs='+', default=['*'],
+        help='Placeholder token which will be used to denote the concept in future prompts')

     parser.add_argument(
         '--init_word',
```
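The change only adds help text; with `nargs='+'` the option still accepts one or more tokens and keeps `['*']` as its default. A small stand-alone sketch of that behaviour (the parser here is a stripped-down stand-in for the one in main.py):

```python
import argparse

# Stand-in parser mirroring only the argument touched by this hunk.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--placeholder_tokens', type=str, nargs='+', default=['*'],
    help='Placeholder token which will be used to denote the concept in future prompts')

print(parser.parse_args([]).placeholder_tokens)                              # ['*']
print(parser.parse_args(['--placeholder_tokens', '*']).placeholder_tokens)  # ['*']
```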
```diff
@@ -473,7 +473,7 @@ def log_img(self, pl_module, batch, batch_idx, split='train'):
             self.check_frequency(check_idx)
             and hasattr(  # batch_idx % self.batch_freq == 0
                 pl_module, 'log_images'
-            )
+            )
             and callable(pl_module.log_images)
             and self.max_images > 0
         ):
```
```diff
@@ -569,6 +569,21 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
         except AttributeError:
             pass

+class ModeSwapCallback(Callback):
+
+    def __init__(self, swap_step=2000):
+        super().__init__()
+        self.is_frozen = False
+        self.swap_step = swap_step
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        if trainer.global_step < self.swap_step and not self.is_frozen:
+            self.is_frozen = True
+            trainer.optimizers = [pl_module.configure_opt_embedding()]
+
+        if trainer.global_step > self.swap_step and self.is_frozen:
+            self.is_frozen = False
+            trainer.optimizers = [pl_module.configure_opt_model()]

 if __name__ == '__main__':
     # custom parser to specify config files, train, test and debug mode,
```
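`ModeSwapCallback` swaps the trainer's optimizer at epoch boundaries: before `swap_step` global steps it installs an embedding-only optimizer, afterwards a full-model optimizer. A hedged usage sketch, assuming the module exposes the `configure_opt_embedding()` and `configure_opt_model()` methods the callback calls (the callback is defined in this commit but not wired into the trainer in this diff):

```python
# Hypothetical wiring, not part of this commit: the first 2000 steps train
# only the embedding, after which the whole model is optimized.
from pytorch_lightning import Trainer

trainer = Trainer(
    max_steps=6100,                               # illustrative step budget
    callbacks=[ModeSwapCallback(swap_step=2000)],
)
# trainer.fit(model, data)  # `model` must provide configure_opt_embedding()
#                           # and configure_opt_model(), each returning an optimizer.
```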
```diff
@@ -663,6 +678,7 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
     if opt.datadir_in_name:
         now = os.path.basename(os.path.normpath(opt.data_root)) + now

+
     nowname = now + name + opt.postfix
     logdir = os.path.join(opt.logdir, nowname)

```
```diff
@@ -756,7 +772,7 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
     if hasattr(model, 'monitor'):
         print(f'Monitoring {model.monitor} as checkpoint metric.')
         default_modelckpt_cfg['params']['monitor'] = model.monitor
-        default_modelckpt_cfg['params']['save_top_k'] = 3
+        default_modelckpt_cfg['params']['save_top_k'] = 1

     if 'modelcheckpoint' in lightning_config:
         modelckpt_cfg = lightning_config.modelcheckpoint
```
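With `save_top_k` lowered from 3 to 1, only the single best checkpoint by the monitored metric is kept, and any `lightning.modelcheckpoint` block from the YAML is layered on top of these defaults. A minimal sketch of that merge, assuming the OmegaConf-based merging main.py uses for its other Lightning configuration (the monitor key is illustrative):

```python
from omegaconf import OmegaConf

# Defaults assembled in main.py (monitor value is illustrative).
default_modelckpt_cfg = OmegaConf.create(
    {'params': {'monitor': 'val/loss_simple_ema', 'save_top_k': 1}}
)
# Override coming from lightning.modelcheckpoint in v1-finetune.yaml.
yaml_modelckpt_cfg = OmegaConf.create({'params': {'every_n_train_steps': 500}})

merged = OmegaConf.merge(default_modelckpt_cfg, yaml_modelckpt_cfg)
print(OmegaConf.to_yaml(merged))
# params:
#   monitor: val/loss_simple_ema
#   save_top_k: 1
#   every_n_train_steps: 500
```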
```diff
@@ -846,7 +862,7 @@ def on_train_epoch_end(self, trainer, pl_module, outputs):
         trainer_kwargs['callbacks'] = [
             instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
         ]
-        trainer_kwargs['max_steps'] = opt.max_steps
+        trainer_kwargs['max_steps'] = trainer_opt.max_steps

         trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
         trainer.logdir = logdir  ###
```
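This last hunk is what makes training stop on its own at the configured step count: `max_steps` is now read from `trainer_opt`, the namespace holding the Lightning Trainer arguments, instead of the project's `opt` namespace, so the value actually reaches the `Trainer`. A small sketch of the effect, with an illustrative step count:

```python
# Illustrative only: once max_steps reaches the Trainer, training halts
# automatically after that many optimizer steps.
from argparse import Namespace
from pytorch_lightning import Trainer

trainer_opt = Namespace(max_steps=6100)   # stand-in for the parsed Lightning args
trainer_kwargs = {'max_steps': trainer_opt.max_steps}

trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
print(trainer.max_steps)   # 6100
```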
