allow customizing the training objective to predict either x0 or noise (eps); default to predicting eps, given results from @Lamikins

lucidrains · lucidrains · commit 19862012f9f6 · 2023-01-27T17:10:59.000-08:00
diff --git a/rin_pytorch/rin_pytorch.py b/rin_pytorch/rin_pytorch.py
@@ -493,6 +493,7 @@ def __init__(
         timesteps = 1000,
         use_ddim = False,
         noise_schedule = 'sigmoid',
+        objective = 'eps',
         schedule_kwargs: dict = dict(),
         time_difference = 0.,
         train_prob_self_cond = 0.9
@@ -501,6 +502,9 @@ def __init__(
         self.model = model
         self.channels = self.model.channels
 
+        assert objective in {'x0', 'eps'}, 'objective must be either predict x0 or noise'
+        self.objective = objective
+
         self.image_size = image_size
 
         if noise_schedule == "linear":
@@ -560,11 +564,7 @@ def ddpm_sample(self, shape, time_difference = None):
 
             # get predicted x0
 
-            x_start, last_latents = self.model(img, noise_cond, x_start, last_latents, return_latents = True)
-
-            # clip x0
-
-            x_start.clamp_(-1., 1.)
+            model_output, last_latents = self.model(img, noise_cond, x_start, last_latents, return_latents = True)
 
             # get log(snr)
 
@@ -577,6 +577,18 @@ def ddpm_sample(self, shape, time_difference = None):
             alpha, sigma = log_snr_to_alpha_sigma(log_snr)
             alpha_next, sigma_next = log_snr_to_alpha_sigma(log_snr_next)
 
+            # derive x0 from the model output (predicted directly, or recovered from predicted noise)
+
+            if self.objective == 'x0':
+                x_start = model_output
+
+            elif self.objective == 'eps':
+                x_start = (img - sigma * model_output) / alpha
+
+            # clip x0
+
+            x_start.clamp_(-1., 1.)
+
             # derive posterior mean and variance
 
             c = -expm1(log_snr - log_snr_next)
@@ -628,15 +640,27 @@ def ddim_sample(self, shape, time_difference = None):
 
             # predict x0
 
-            x_start, last_latents = self.model(img, log_snr, x_start, last_latents, return_latents = True)
+            model_output, last_latents = self.model(img, log_snr, x_start, last_latents, return_latents = True)
+
+            # calculate x0 and noise
+
+            if self.objective == 'x0':
+                x_start = model_output
+
+            elif self.objective == 'eps':
+                x_start = (img - sigma * model_output) / alpha
 
             # clip x0
 
             x_start.clamp_(-1., 1.)
 
             # get predicted noise
 
-            pred_noise = (img - alpha * x_start) / sigma.clamp(min = 1e-8)
+            if self.objective == 'x0':
+                pred_noise = (img - alpha * x_start) / sigma.clamp(min = 1e-8)
+
+            elif self.objective == 'eps':
+                pred_noise = model_output
 
             # calculate x next
 
@@ -687,7 +711,13 @@ def forward(self, img, *args, **kwargs):
 
         pred = self.model(noised_img, noise_level, self_cond, self_latents)
 
-        return F.mse_loss(pred, img)
+        if self.objective == 'x0':
+            target = img
+
+        elif self.objective == 'eps':
+            target = noise
+
+        return F.mse_loss(pred, target)
 
 # dataset classes
 
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'RIN-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.1',
+  version = '0.2.0',
   license='MIT',
   description = 'RIN - Recurrent Interface Network - Pytorch',
   author = 'Phil Wang',