
Commit 9f6ade8

fix memory error
1 parent 91f9b54 commit 9f6ade8

13 files changed: +141 −76 lines

README.md

Lines changed: 4 additions & 4 deletions
@@ -52,15 +52,15 @@ See description of the parameters in the ```config/taichi-256.yaml```.
 ### Training
 To train a model on specific dataset run:
 ```
-CUDA_VISIBLE_DEVICES=0,1 python run.py --config config/dataset_name.yaml --device_ids 0,1
+accelerate launch run.py --config config/dataset_name.yaml --device_ids 0,1
 ```
 A log folder named after the timestamp will be created. Checkpoints, loss values, reconstruction results will be saved to this folder.


 #### Training AVD network
 To train a model on specific dataset run:
 ```
-CUDA_VISIBLE_DEVICES=0 python run.py --mode train_avd --checkpoint '{checkpoint_folder}/checkpoint.pth.tar' --config config/dataset_name.yaml
+accelerate launch run.py --mode train_avd --checkpoint '{checkpoint_folder}/checkpoint.pth.tar' --config config/dataset_name.yaml
 ```
 Checkpoints, loss values, reconstruction results will be saved to `{checkpoint_folder}`.

@@ -70,7 +70,7 @@ Checkpoints, loss values, reconstruction results will be saved to `{checkpoint_f

 To evaluate the reconstruction performance run:
 ```
-CUDA_VISIBLE_DEVICES=0 python run.py --mode reconstruction --config config/dataset_name.yaml --checkpoint '{checkpoint_folder}/checkpoint.pth.tar'
+accelerate launch run.py --mode reconstruction --config config/dataset_name.yaml --checkpoint '{checkpoint_folder}/checkpoint.pth.tar'
 ```
 The `reconstruction` subfolder will be created in `{checkpoint_folder}`.
 The generated video will be stored to this folder, also generated videos will be stored in ```png``` subfolder in loss-less '.png' format for evaluation.
@@ -81,7 +81,7 @@ To compute metrics, follow instructions from [pose-evaluation](https://github.co
 - notebook: `demo.ipynb`, edit the config cell and run for image animation.
 - python:
 ```bash
-CUDA_VISIBLE_DEVICES=0 python demo.py --config config/vox-256.yaml --checkpoint checkpoints/vox.pth.tar --source_image ./source.jpg --driving_video ./driving.mp4
+python demo.py --config config/vox-256.yaml --checkpoint checkpoints/vox.pth.tar --source_image ./source.jpg --driving_video ./driving.mp4
 ```

 # Acknowledgments
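Note on the command change above: the README now hands device placement to Hugging Face Accelerate (`accelerate launch`) instead of pinning GPUs with `CUDA_VISIBLE_DEVICES`. For context, here is a minimal, self-contained sketch of the training-loop pattern such a launch command generally assumes inside the launched script; the tiny model, optimizer, and dataset are placeholders for illustration, not the repo's actual `run.py`.

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()  # process count / devices come from `accelerate launch` or `accelerate config`

# Placeholder model and data, purely for illustration
model = nn.Linear(16, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
dataloader = DataLoader(TensorDataset(torch.randn(64, 16), torch.randn(64, 1)), batch_size=8)

# prepare() moves everything to the right device and wraps the model for multi-GPU runs
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for x, y in dataloader:
    loss = nn.functional.mse_loss(model(x), y)
    accelerator.backward(loss)  # used in place of loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```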

config/vox-256.yaml

Lines changed: 7 additions & 7 deletions
@@ -1,5 +1,5 @@
 dataset_params:
-  root_dir: ../vox
+  root_dir: vox_256
   frame_shape: null
   id_sampling: True
   augmentation_params:
@@ -58,12 +58,12 @@ train_params:
     bg: 10

 train_avd_params:
-  num_epochs: 200
-  num_repeats: 300
-  batch_size: 256
-  dataloader_workers: 24
-  checkpoint_freq: 50
-  epoch_milestones: [140, 180]
+  num_epochs: 100
+  num_repeats: 1
+  batch_size: 8
+  dataloader_workers: 6
+  checkpoint_freq: 1
+  epoch_milestones: [10, 20]
   lr: 1.0e-3
   lambda_shift: 1
   random_scale: 0.25

config/vox-512.yaml renamed to config/vox-512-finetune.yaml

Lines changed: 13 additions & 11 deletions
@@ -1,3 +1,4 @@
+# Use this file to finetune from a pretrained 256x256 model
 dataset_params:
   root_dir: vox
   frame_shape: null
@@ -35,18 +36,19 @@ model_params:

 train_params:
   num_epochs: 100
-  num_repeats: 75
+  num_repeats: 10
   epoch_milestones: [70, 90]
-  lr_generator: 2.0e-4
+  # Higher LR seems to bring problems when finetuning
+  lr_generator: 2.0e-5
   batch_size: 4
   scales: [1, 0.5, 0.25, 0.125]
-  dataloader_workers: 12
-  checkpoint_freq: 1
-  dropout_epoch: 35
+  dataloader_workers: 6
+  checkpoint_freq: 2
+  dropout_epoch: 0
   dropout_maxp: 0.3
   dropout_startp: 0.1
   dropout_inc_epoch: 10
-  bg_start: 10
+  bg_start: 0
   transform_params:
     sigma_affine: 0.05
     sigma_tps: 0.005
@@ -59,11 +61,11 @@ train_params:

 train_avd_params:
   num_epochs: 200
-  num_repeats: 300
-  batch_size: 256
-  dataloader_workers: 24
-  checkpoint_freq: 50
-  epoch_milestones: [140, 180]
+  num_repeats: 1
+  batch_size: 4
+  dataloader_workers: 6
+  checkpoint_freq: 2
+  epoch_milestones: [10, 20]
   lr: 1.0e-3
   lambda_shift: 1
   random_scale: 0.25

config/vox-768.yaml renamed to config/vox-768-finetune.yaml

Lines changed: 11 additions & 9 deletions
@@ -1,3 +1,4 @@
+# Use this file to finetune from a pretrained 256x256 model
 dataset_params:
   root_dir: vox_768
   frame_shape: null
@@ -35,18 +36,19 @@ model_params:

 train_params:
   num_epochs: 100
-  num_repeats: 75
+  num_repeats: 1
   epoch_milestones: [70, 90]
-  lr_generator: 2.0e-4
+  # Higher LR seems to bring problems when finetuning
+  lr_generator: 2.0e-5
   batch_size: 1
   scales: [1, 0.5, 0.25, 0.125]
-  dataloader_workers: 12
+  dataloader_workers: 6
   checkpoint_freq: 1
-  dropout_epoch: 35
+  dropout_epoch: 0
   dropout_maxp: 0.3
   dropout_startp: 0.1
   dropout_inc_epoch: 10
-  bg_start: 10
+  bg_start: 0
   transform_params:
     sigma_affine: 0.05
     sigma_tps: 0.005
@@ -59,10 +61,10 @@ train_params:

 train_avd_params:
   num_epochs: 200
-  num_repeats: 300
-  batch_size: 256
-  dataloader_workers: 24
-  checkpoint_freq: 50
+  num_repeats: 1
+  batch_size: 1
+  dataloader_workers: 6
+  checkpoint_freq: 1
   epoch_milestones: [140, 180]
   lr: 1.0e-3
   lambda_shift: 1

demo.py

Lines changed: 42 additions & 18 deletions
@@ -67,19 +67,24 @@ def load_checkpoints(config_path, checkpoint_path, device):
     return inpainting, kp_detector, dense_motion_network, avd_network


-def make_animation(source_image, driving_video, inpainting_network, kp_detector, dense_motion_network, avd_network, device:torch.device, mode = 'relative'):
+def make_animation(source_image, driving_video_generator, inpainting_network, kp_detector, dense_motion_network, avd_network, device:torch.device, mode = 'relative'):
     assert mode in ['standard', 'relative', 'avd']
     with torch.no_grad():
         with torch.autocast(device_type=str(device), dtype=torch.float16):
             source = torch.tensor(source_image[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2)
             source = source.to(device)
-            driving = torch.tensor(np.array(driving_video)[np.newaxis].astype(np.float32)).permute(0, 4, 1, 2, 3).to(device)
+            #driving = torch.tensor(np.array(driving_video)[np.newaxis].astype(np.float32)).permute(0, 4, 1, 2, 3).to(device)
             kp_source = kp_detector(source)
-            kp_driving_initial = kp_detector(driving[:, :, 0])

-            for frame_idx in tqdm(range(driving.shape[2])):
-                driving_frame = driving[:, :, frame_idx]
-                driving_frame = driving_frame.to(device)
+            first_frame = True
+
+            for driving_frame_np in tqdm(driving_video_generator):
+
+                driving_frame = torch.tensor(driving_frame_np[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2).to(
+                    device)
+                if first_frame:
+                    kp_driving_initial = kp_detector(driving_frame)
+                    first_frame = False
                 kp_driving = kp_detector(driving_frame)
                 if mode == 'standard':
                     kp_norm = kp_driving
@@ -95,7 +100,6 @@ def make_animation(source_image, driving_video, inpainting, kp_detector,

                 yield np.transpose(out['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0]

-
 def find_best_frame(source, driving, cpu):
     import face_alignment

@@ -123,7 +127,32 @@ def normalize_kp(kp):
         except:
             pass
     return frame_num
+def read_and_resize_frames(video_path, img_shape):
+    reader = imageio.get_reader(video_path)
+    for frame in reader:
+        resized_frame = resize(frame, img_shape)[..., :3]
+        yield resized_frame
+    reader.close()

+def read_and_resize_frames_forward(video_path, img_shape, start_frame):
+    reader = imageio.get_reader(video_path)
+    for idx, frame in enumerate(reader):
+        if idx < start_frame:
+            continue
+        resized_frame = resize(frame, img_shape)[..., :3]
+        yield resized_frame
+    reader.close()
+
+def read_and_resize_frames_backward(video_path, img_shape, end_frame):
+    reader = imageio.get_reader(video_path)
+    frames = []
+    for idx, frame in enumerate(reader):
+        if idx > end_frame:
+            break
+        resized_frame = resize(frame, img_shape)[..., :3]
+        frames.append(resized_frame)
+    reader.close()
+    return reversed(frames)

 if __name__ == "__main__":
     parser = ArgumentParser()
@@ -149,12 +178,6 @@ def normalize_kp(kp):
     source_image = imageio.imread(opt.source_image)
     reader = imageio.get_reader(opt.driving_video)
     fps = reader.get_meta_data()['fps']
-    driving_video = []
-    try:
-        for im in reader:
-            driving_video.append(im)
-    except RuntimeError:
-        pass
     reader.close()

     if opt.cpu:
@@ -163,7 +186,6 @@ def normalize_kp(kp):
         device = torch.device('cuda')

     source_image = resize(source_image, opt.img_shape)[..., :3]
-    driving_video = [resize(frame, opt.img_shape)[..., :3] for frame in driving_video]
     inpainting, kp_detector, dense_motion_network, avd_network = load_checkpoints(config_path = opt.config, checkpoint_path = opt.checkpoint, device = device)

     def reversed_generator(generator):
@@ -175,10 +197,11 @@ def append_frame_to_writer(frame, writer):


     if opt.find_best_frame:
-        i = find_best_frame(source_image, driving_video, opt.cpu)
+        driving_video_generator = read_and_resize_frames(opt.driving_video, opt.img_shape)
+        i = find_best_frame(source_image, driving_video_generator, opt.cpu)
         print("Best frame:", i)
-        driving_forward = driving_video[i:]
-        driving_backward = driving_video[:(i + 1)][::-1]
+        driving_forward = read_and_resize_frames_forward(opt.driving_video, opt.img_shape, i)
+        driving_backward = read_and_resize_frames_backward(opt.driving_video, opt.img_shape, i)

         with imageio.get_writer(opt.result_video, mode='I', fps=fps) as writer:
             # Generate and append frames for the reversed backward animation
@@ -196,6 +219,7 @@ def append_frame_to_writer(frame, writer):
                 append_frame_to_writer(frame, writer)
     else:
         with imageio.get_writer(opt.result_video, mode='I', fps=fps) as writer:
-            for frame in make_animation(source_image, driving_video, inpainting, kp_detector, dense_motion_network,
+            driving_video_generator = read_and_resize_frames(opt.driving_video, opt.img_shape)
+            for frame in make_animation(source_image, driving_video_generator, inpainting, kp_detector, dense_motion_network,
                 avd_network, device=device, mode=opt.mode):
                 append_frame_to_writer(frame, writer)
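The demo.py hunks above are the heart of the memory fix: instead of stacking the whole driving video into a single `(1, C, T, H, W)` tensor on the GPU, the new `read_and_resize_frames*` generators decode and resize one frame at a time, and `make_animation` consumes them, taking `kp_driving_initial` from the first yielded frame. A minimal sketch of the same streaming pattern, with a placeholder video path, might look like this:

```python
import imageio
import numpy as np
import torch
from skimage.transform import resize

def stream_frames(video_path, img_shape=(256, 256)):
    # Yield resized RGB frames one at a time instead of loading the whole clip
    reader = imageio.get_reader(video_path)
    for frame in reader:
        yield resize(frame, img_shape)[..., :3]  # drop any alpha channel
    reader.close()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# './driving.mp4' is a placeholder path; only one frame is resident at a time.
for frame_np in stream_frames('./driving.mp4'):
    frame = torch.tensor(frame_np[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2).to(device)
    # ... per-frame keypoint detection and generation would go here ...
```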

frames_dataset.py

Lines changed: 0 additions & 2 deletions
@@ -74,7 +74,6 @@ def __init__(self, root_dir, frame_shape=(256, 256, 3), id_sampling=False, is_tr

         if os.path.exists(os.path.join(root_dir, 'train')):
             assert os.path.exists(os.path.join(root_dir, 'test'))
-            print("Use predefined train-test split.")
             if id_sampling:
                 train_videos = {os.path.basename(video).split('#')[0] for video in
                                 os.listdir(os.path.join(root_dir, 'train'))}
@@ -84,7 +83,6 @@ def __init__(self, root_dir, frame_shape=(256, 256, 3), id_sampling=False, is_tr
             test_videos = os.listdir(os.path.join(root_dir, 'test'))
             self.root_dir = os.path.join(self.root_dir, 'train' if is_train else 'test')
         else:
-            print("Use random train-test split.")
             train_videos, test_videos = train_test_split(self.videos, random_state=random_seed, test_size=0.2)

         if is_train:

logger.py

Lines changed: 18 additions & 8 deletions
@@ -1,3 +1,5 @@
+from typing import Dict
+
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -15,6 +17,7 @@ class Logger:
     def __init__(self, log_dir, checkpoint_freq=50, visualizer_params=None,
                  zfill_num=8, log_file_name='log.txt', models=()):

+        self.models = None
         self.loss_list = []
         self.cpk_dir = log_dir
         self.visualizations_dir = os.path.join(log_dir, 'train-vis')
@@ -43,19 +46,21 @@ def log_scores(self, loss_names):

     def visualize_rec(self, inp, out):
         image = self.visualizer.visualize(inp['driving'], inp['source'], out)
-        imageio.imsave(os.path.join(self.visualizations_dir, "%s-rec.png" % str(self.epoch).zfill(self.zfill_num)), image)
+        imageio.imsave(os.path.join(self.visualizations_dir, "%s-rec.png" % str(self.epoch).zfill(self.zfill_num)),
+                       image)
+        wandb.log({"image": [wandb.Image(image)]})

     def save_cpk(self, emergent=False):
         cpk = {k: v.state_dict() for k, v in self.models.items()}
         cpk['epoch'] = self.epoch
-        cpk_path = os.path.join(self.cpk_dir, '%s-checkpoint.pth.tar' % str(self.epoch).zfill(self.zfill_num))
+        cpk_path = os.path.join(self.cpk_dir, '%s-checkpoint.pth.tar' % str(self.epoch).zfill(self.zfill_num))
         if not (os.path.exists(cpk_path) and emergent):
             torch.save(cpk, cpk_path)

     @staticmethod
-    def load_cpk(checkpoint_path, inpainting_network=None, dense_motion_network =None, kp_detector=None,
-                 bg_predictor=None, avd_network=None, optimizer=None, optimizer_bg_predictor=None,
-                 optimizer_avd=None):
+    def load_cpk(checkpoint_path, inpainting_network=None, dense_motion_network=None, kp_detector=None,
+                 bg_predictor=None, avd_network=None, optimizer=None, optimizer_bg_predictor=None,
+                 optimizer_avd=None):
         checkpoint = torch.load(checkpoint_path)
         if inpainting_network is not None:
             inpainting_network.load_state_dict(checkpoint['inpainting_network'])
@@ -78,6 +83,9 @@ def load_cpk(checkpoint_path, inpainting_network=None, dense_motion_network =Non
         epoch = -1
         if 'epoch' in checkpoint:
             epoch = checkpoint['epoch']
+
+        print('Loaded checkpoint from epoch %d' % epoch)
+        print('keys: ', checkpoint.keys())
         return epoch

     def __enter__(self):
@@ -89,10 +97,12 @@ def __exit__(self, exc_type, exc_value, tb):
         self.log_file.close()
         wandb.finish()

-    def log_iter(self, losses):
+    def log_iter(self, losses, others: Dict = None):
         losses = collections.OrderedDict(losses.items())
         self.names = list(losses.keys())
         self.loss_list.append(list(losses.values()))
+        if others is not None:
+            losses.update(others)
         wandb.log(losses)

     def log_epoch(self, epoch, models, inp, out):
@@ -176,7 +186,6 @@ def visualize(self, driving, source, out):
            images.append((prediction, kp_norm))
        images.append(prediction)

-
        ## Occlusion map
        if 'occlusion_map' in out:
            for i in range(len(out['occlusion_map'])):
@@ -192,7 +201,7 @@ def visualize(self, driving, source, out):
                image = out['deformed_source'][:, i].data.cpu()
                # import ipdb;ipdb.set_trace()
                image = F.interpolate(image, size=source.shape[1:3])
-                mask = out['contribution_maps'][:, i:(i+1)].data.cpu().repeat(1, 3, 1, 1)
+                mask = out['contribution_maps'][:, i:(i + 1)].data.cpu().repeat(1, 3, 1, 1)
                mask = F.interpolate(mask, size=source.shape[1:3])
                image = np.transpose(image.numpy(), (0, 2, 3, 1))
                mask = np.transpose(mask.numpy(), (0, 2, 3, 1))
@@ -216,4 +225,5 @@ def visualize(self, driving, source, out):

        image = self.create_image_grid(*images)
        image = (255 * image).astype(np.uint8)
+
        return image
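The `log_iter` change above lets callers attach extra scalars (for example the current learning rate) to the same wandb step without treating them as losses: only the original `losses` values are appended to `self.loss_list` for epoch averaging, while `others` is merged in just before `wandb.log`. A small stand-alone illustration of that merge, with made-up values:

```python
import collections

losses = collections.OrderedDict({'perceptual': 74.2, 'equivariance_value': 0.31})
loss_list = [list(losses.values())]   # only real losses are kept for epoch averaging

others = {'lr': 2.0e-5}               # hypothetical extra scalar
if others is not None:
    losses.update(others)             # the merged dict is what would reach wandb.log

print(loss_list)      # [[74.2, 0.31]]
print(dict(losses))   # {'perceptual': 74.2, 'equivariance_value': 0.31, 'lr': 2e-05}
```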

modules/bg_motion_predictor.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+import torchvision
 from torch import nn
 import torch
 from torchvision import models
@@ -9,7 +10,7 @@ class BGMotionPredictor(nn.Module):

     def __init__(self):
         super(BGMotionPredictor, self).__init__()
-        self.bg_encoder = models.resnet18(pretrained=False)
+        self.bg_encoder = models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
         self.bg_encoder.conv1 = nn.Conv2d(6, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
         num_features = self.bg_encoder.fc.in_features
         self.bg_encoder.fc = nn.Linear(num_features, 6)
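The BGMotionPredictor hunk swaps the deprecated `pretrained=` argument for torchvision's newer `weights=` enum. Note this is also a behavioural change: `ResNet18_Weights.DEFAULT` downloads ImageNet weights, whereas `pretrained=False` started from random initialization (the replaced 6-channel `conv1` discards its pretrained kernel either way). Below is a stand-alone sketch of the same construction, assuming the 6 input channels are stacked source and driving frames and the 6 outputs are background transform parameters:

```python
import torch
from torch import nn
from torchvision import models

# New-style torchvision API: a weights enum instead of pretrained=True/False
encoder = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

# Accept 6-channel input; the pretrained 3-channel conv1 kernel is discarded here
encoder.conv1 = nn.Conv2d(6, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
encoder.fc = nn.Linear(encoder.fc.in_features, 6)

x = torch.randn(1, 6, 256, 256)   # dummy batch
print(encoder(x).shape)           # torch.Size([1, 6])
```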
