22import copy
33from pathlib import Path
44from random import random
5- from functools import partial
5+ from functools import partial , wraps
66from collections import namedtuple
77from multiprocessing import cpu_count
88
@@ -375,6 +375,7 @@ def forward_with_cond_scale(
375375 self ,
376376 * args ,
377377 cond_scale = 1. ,
378+ rescale_phi = 0. ,
378379 ** kwargs
379380 ):
380381 logits = self .forward (* args , cond_drop_prob = 0. , ** kwargs )
@@ -383,7 +384,18 @@ def forward_with_cond_scale(
383384 return logits
384385
385386 null_logits = self .forward (* args , cond_drop_prob = 1. , ** kwargs )
386- return null_logits + (logits - null_logits ) * cond_scale
387+ scaled_logits = null_logits + (logits - null_logits ) * cond_scale
388+
389+ if rescale_phi <= 0 :
390+ return scaled_logits
391+
392+ # rescaling proposed in https://arxiv.org/abs/2305.08891 to prevent over-saturation
393+ # works for both pixel and latent space, as opposed to only pixel space with the technique from Imagen
394+ # they found 0.7 to work well empirically with a conditional scale of 6.
395+
396+ std_fn = partial (torch .std , dim = tuple (range (1 , scaled_logits .ndim )), keepdim = True )
397+ rescaled_logits = scaled_logits * (std_fn (logits ) / std_fn (scaled_logits ))
398+ return rescaled_logits * rescale_phi + (1 - rescale_phi ) * scaled_logits
387399
388400 def forward (
389401 self ,
@@ -457,6 +469,33 @@ def extract(a, t, x_shape):
457469 out = a .gather (- 1 , t )
458470 return out .reshape (b , * ((1 ,) * (len (x_shape ) - 1 )))
459471
def enforce_zero_terminal_snr(schedule_fn):
    """Decorator implementing Algorithm 1 of https://arxiv.org/abs/2305.08891.

    Wraps a beta-schedule function so the returned schedule has exactly zero
    terminal SNR (sqrt(alphas_cumprod)[-1] == 0), which the paper shows is
    required for the model to be trained on (and sample from) pure noise.

    The wrapped function's signature and output length are preserved: given
    `timesteps` it still returns a betas tensor of shape (timesteps,).
    """

    @wraps(schedule_fn)
    def inner(*args, **kwargs):
        betas = schedule_fn(*args, **kwargs)
        alphas = 1. - betas

        # NOTE: operate on the FULL cumulative product. The previous version
        # padded `alphas_cumprod[:-1]` with a leading 1, which silently
        # discarded the true terminal alpha-bar (so the wrong element was
        # zeroed) and shrank the schedule from T to T-1 betas.
        alphas_cumprod = torch.cumprod(alphas, dim = 0)
        alphas_cumprod_sqrt = torch.sqrt(alphas_cumprod)

        first = alphas_cumprod_sqrt[0].clone()
        terminal = alphas_cumprod_sqrt[-1].clone()

        # shift so the last sqrt(alpha-bar) is exactly 0 (zero terminal snr),
        # then rescale so the first sqrt(alpha-bar) keeps its original value
        alphas_cumprod_sqrt -= terminal
        alphas_cumprod_sqrt *= first / (first - terminal)

        alphas_cumprod = alphas_cumprod_sqrt ** 2

        # recover per-step alphas from consecutive cumprod ratios;
        # re-prepend alpha_0 = alpha-bar_0 to restore length T (paper, Alg. 1)
        alphas = alphas_cumprod[1:] / alphas_cumprod[:-1]
        alphas = torch.cat((alphas_cumprod[:1], alphas))

        betas = 1. - alphas
        return betas

    return inner
497+
498+ @enforce_zero_terminal_snr
460499def linear_beta_schedule (timesteps ):
461500 scale = 1000 / timesteps
462501 beta_start = scale * 0.0001
@@ -473,7 +512,7 @@ def cosine_beta_schedule(timesteps, s = 0.008):
473512 alphas_cumprod = torch .cos (((x / timesteps ) + s ) / (1 + s ) * math .pi * 0.5 ) ** 2
474513 alphas_cumprod = alphas_cumprod / alphas_cumprod [0 ]
475514 betas = 1 - (alphas_cumprod [1 :] / alphas_cumprod [:- 1 ])
476- return torch .clip (betas , 0 , 0.999 )
515+ return torch .clip (betas , 0 , 1. )
477516
478517class GaussianDiffusion (nn .Module ):
479518 def __init__ (
@@ -606,8 +645,8 @@ def q_posterior(self, x_start, x_t, t):
606645 posterior_log_variance_clipped = extract (self .posterior_log_variance_clipped , t , x_t .shape )
607646 return posterior_mean , posterior_variance , posterior_log_variance_clipped
608647
609- def model_predictions (self , x , t , classes , cond_scale = 3. , clip_x_start = False ):
610- model_output = self .model .forward_with_cond_scale (x , t , classes , cond_scale = cond_scale )
648+ def model_predictions (self , x , t , classes , cond_scale = 6. , rescale_phi = 0.7 , clip_x_start = False ):
649+ model_output = self .model .forward_with_cond_scale (x , t , classes , cond_scale = cond_scale , rescale_phi = rescale_phi )
611650 maybe_clip = partial (torch .clamp , min = - 1. , max = 1. ) if clip_x_start else identity
612651
613652 if self .objective == 'pred_noise' :
@@ -639,7 +678,7 @@ def p_mean_variance(self, x, t, classes, cond_scale, clip_denoised = True):
639678 return model_mean , posterior_variance , posterior_log_variance , x_start
640679
641680 @torch .no_grad ()
642- def p_sample (self , x , t : int , classes , cond_scale = 3. , clip_denoised = True ):
681+ def p_sample (self , x , t : int , classes , cond_scale = 6. , rescale_phi = 0.7 , clip_denoised = True ):
643682 b , * _ , device = * x .shape , x .device
644683 batched_times = torch .full ((x .shape [0 ],), t , device = x .device , dtype = torch .long )
645684 model_mean , _ , model_log_variance , x_start = self .p_mean_variance (x = x , t = batched_times , classes = classes , cond_scale = cond_scale , clip_denoised = clip_denoised )
@@ -648,7 +687,7 @@ def p_sample(self, x, t: int, classes, cond_scale = 3., clip_denoised = True):
648687 return pred_img , x_start
649688
650689 @torch .no_grad ()
651- def p_sample_loop (self , classes , shape , cond_scale = 3. ):
690+ def p_sample_loop (self , classes , shape , cond_scale = 6. , rescale_phi = 0.7 ):
652691 batch , device = shape [0 ], self .betas .device
653692
654693 img = torch .randn (shape , device = device )
@@ -662,7 +701,7 @@ def p_sample_loop(self, classes, shape, cond_scale = 3.):
662701 return img
663702
664703 @torch .no_grad ()
665- def ddim_sample (self , classes , shape , cond_scale = 3. , clip_denoised = True ):
704+ def ddim_sample (self , classes , shape , cond_scale = 6. , rescale_phi = 0.7 , clip_denoised = True ):
666705 batch , device , total_timesteps , sampling_timesteps , eta , objective = shape [0 ], self .betas .device , self .num_timesteps , self .sampling_timesteps , self .ddim_sampling_eta , self .objective
667706
668707 times = torch .linspace (- 1 , total_timesteps - 1 , steps = sampling_timesteps + 1 ) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
@@ -726,15 +765,6 @@ def q_sample(self, x_start, t, noise=None):
726765 extract (self .sqrt_one_minus_alphas_cumprod , t , x_start .shape ) * noise
727766 )
728767
729- @property
730- def loss_fn (self ):
731- if self .loss_type == 'l1' :
732- return F .l1_loss
733- elif self .loss_type == 'l2' :
734- return F .mse_loss
735- else :
736- raise ValueError (f'invalid loss type { self .loss_type } ' )
737-
738768 def p_losses (self , x_start , t , * , classes , noise = None ):
739769 b , c , h , w = x_start .shape
740770 noise = default (noise , lambda : torch .randn_like (x_start ))
@@ -757,7 +787,7 @@ def p_losses(self, x_start, t, *, classes, noise = None):
757787 else :
758788 raise ValueError (f'unknown objective { self .objective } ' )
759789
760- loss = self . loss_fn (model_out , target , reduction = 'none' )
790+ loss = F . mse_loss (model_out , target , reduction = 'none' )
761791 loss = reduce (loss , 'b ... -> b (...)' , 'mean' )
762792
763793 loss = loss * extract (self .loss_weight , t , loss .shape )
0 commit comments