lucidrains
diff --git a/README.md b/README.md
Lines changed: 4 additions & 4 deletions
diff --git a/denoising_diffusion_pytorch/classifier_free_guidance.py b/denoising_diffusion_pytorch/classifier_free_guidance.py
Lines changed: 18 additions & 48 deletions
diff --git a/denoising_diffusion_pytorch/continuous_time_gaussian_diffusion.py b/denoising_diffusion_pytorch/continuous_time_gaussian_diffusion.py
Lines changed: 13 additions & 1 deletion
diff --git a/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py b/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
Lines changed: 20 additions & 45 deletions
@@ -38,7 +38,8 @@ model = Unet(
 diffusion = GaussianDiffusion(
     model,
     image_size = 128,
-    timesteps = 1000    # number of steps
+    timesteps = 1000,   # number of steps
+    loss_type = 'l1'    # L1 or L2
 )
 
 training_images = torch.rand(8, 3, 128, 128) # images are normalized from 0 to 1
@@ -64,7 +65,8 @@ diffusion = GaussianDiffusion(
     model,
     image_size = 128,
     timesteps = 1000,           # number of steps
-    sampling_timesteps = 250    # number of sampling timesteps (using ddim for faster inference [see citation for ddim paper])
+    sampling_timesteps = 250,   # number of sampling timesteps (using ddim for faster inference [see citation for ddim paper])
+    loss_type = 'l1'            # L1 or L2
 )
 
 trainer = Trainer(
@@ -146,11 +148,9 @@ sampled_seq = diffusion.sample(batch_size = 4)
 sampled_seq.shape # (4, 32, 128)
 
 ```
-
 `Trainer1D` does not evaluate the generated samples in any way since the type of data is not known. 
 You could consider adding a suitable metric to the training loop yourself after doing an editable install of this package
 `pip install -e .`.
-
 ## Citations
 
 ```bibtex
 
@@ -2,7 +2,7 @@
 import copy
 from pathlib import Path
 from random import random
-from functools import partial, wraps
+from functools import partial
 from collections import namedtuple
 from multiprocessing import cpu_count
 
@@ -375,7 +375,6 @@ def forward_with_cond_scale(
         self,
         *args,
         cond_scale = 1.,
-        rescale_phi = 0.,
         **kwargs
     ):
         logits = self.forward(*args, cond_drop_prob = 0., **kwargs)
@@ -384,18 +383,7 @@ def forward_with_cond_scale(
             return logits
 
         null_logits = self.forward(*args, cond_drop_prob = 1., **kwargs)
-        scaled_logits = null_logits + (logits - null_logits) * cond_scale
-
-        if rescale_phi <= 0:
-            return scaled_logits
-
-        # rescaling proposed in https://arxiv.org/abs/2305.08891 to prevent over-saturation
-        # works for both pixel and latent space, as opposed to only pixel space with the technique from Imagen
-        # they found 0.7 to work well empirically with a conditional scale of 6.
-
-        std_fn = partial(torch.std, dim = tuple(range(1, scaled_logits.ndim)), keepdim = True)
-        rescaled_logits = scaled_logits * (std_fn(logits) / std_fn(scaled_logits))
-        return rescaled_logits * rescale_phi + (1 - rescale_phi) * scaled_logits
+        return null_logits + (logits - null_logits) * cond_scale
 
     def forward(
         self,
@@ -469,33 +457,6 @@ def extract(a, t, x_shape):
     out = a.gather(-1, t)
     return out.reshape(b, *((1,) * (len(x_shape) - 1)))
 
-def enforce_zero_terminal_snr(schedule_fn):
-    # algorithm 1 in https://arxiv.org/abs/2305.08891
-
-    @wraps(schedule_fn)
-    def inner(*args, **kwargs):
-        betas = schedule_fn(*args, **kwargs)
-        alphas = 1. - betas
-
-        alphas_cumprod = torch.cumprod(alphas, dim = 0)
-        alphas_cumprod = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)
-
-        alphas_cumprod_sqrt = torch.sqrt(alphas_cumprod)
-
-        terminal_snr = alphas_cumprod_sqrt[-1].clone()
-
-        alphas_cumprod_sqrt -= terminal_snr # enforce zero terminal snr
-        alphas_cumprod_sqrt *= 1. / (1. - terminal_snr)
-
-        alphas_cumprod = alphas_cumprod_sqrt ** 2
-        alphas = alphas_cumprod[1:] / alphas_cumprod[:-1]
-        betas = 1. - alphas
-
-        return betas
-
-    return inner
-
-@enforce_zero_terminal_snr
 def linear_beta_schedule(timesteps):
     scale = 1000 / timesteps
     beta_start = scale * 0.0001
@@ -512,7 +473,7 @@ def cosine_beta_schedule(timesteps, s = 0.008):
     alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
     alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
     betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
-    return torch.clip(betas, 0, 1.)
+    return torch.clip(betas, 0, 0.999)
 
 class GaussianDiffusion(nn.Module):
     def __init__(
@@ -645,8 +606,8 @@ def q_posterior(self, x_start, x_t, t):
         posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
         return posterior_mean, posterior_variance, posterior_log_variance_clipped
 
-    def model_predictions(self, x, t, classes, cond_scale = 6., rescale_phi = 0.7, clip_x_start = False):
-        model_output = self.model.forward_with_cond_scale(x, t, classes, cond_scale = cond_scale, rescale_phi = rescale_phi)
+    def model_predictions(self, x, t, classes, cond_scale = 3., clip_x_start = False):
+        model_output = self.model.forward_with_cond_scale(x, t, classes, cond_scale = cond_scale)
         maybe_clip = partial(torch.clamp, min = -1., max = 1.) if clip_x_start else identity
 
         if self.objective == 'pred_noise':
@@ -678,7 +639,7 @@ def p_mean_variance(self, x, t, classes, cond_scale, clip_denoised = True):
         return model_mean, posterior_variance, posterior_log_variance, x_start
 
     @torch.no_grad()
-    def p_sample(self, x, t: int, classes, cond_scale = 6., rescale_phi = 0.7, clip_denoised = True):
+    def p_sample(self, x, t: int, classes, cond_scale = 3., clip_denoised = True):
         b, *_, device = *x.shape, x.device
         batched_times = torch.full((x.shape[0],), t, device = x.device, dtype = torch.long)
         model_mean, _, model_log_variance, x_start = self.p_mean_variance(x = x, t = batched_times, classes = classes, cond_scale = cond_scale, clip_denoised = clip_denoised)
@@ -687,7 +648,7 @@ def p_sample(self, x, t: int, classes, cond_scale = 6., rescale_phi = 0.7, clip_
         return pred_img, x_start
 
     @torch.no_grad()
-    def p_sample_loop(self, classes, shape, cond_scale = 6., rescale_phi = 0.7):
+    def p_sample_loop(self, classes, shape, cond_scale = 3.):
         batch, device = shape[0], self.betas.device
 
         img = torch.randn(shape, device=device)
@@ -701,7 +662,7 @@ def p_sample_loop(self, classes, shape, cond_scale = 6., rescale_phi = 0.7):
         return img
 
     @torch.no_grad()
-    def ddim_sample(self, classes, shape, cond_scale = 6., rescale_phi = 0.7, clip_denoised = True):
+    def ddim_sample(self, classes, shape, cond_scale = 3., clip_denoised = True):
         batch, device, total_timesteps, sampling_timesteps, eta, objective = shape[0], self.betas.device, self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
 
         times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1)   # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
@@ -765,6 +726,15 @@ def q_sample(self, x_start, t, noise=None):
             extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
         )
 
+    @property
+    def loss_fn(self):
+        if self.loss_type == 'l1':
+            return F.l1_loss
+        elif self.loss_type == 'l2':
+            return F.mse_loss
+        else:
+            raise ValueError(f'invalid loss type {self.loss_type}')
+
     def p_losses(self, x_start, t, *, classes, noise = None):
         b, c, h, w = x_start.shape
         noise = default(noise, lambda: torch.randn_like(x_start))
@@ -787,7 +757,7 @@ def p_losses(self, x_start, t, *, classes, noise = None):
         else:
             raise ValueError(f'unknown objective {self.objective}')
 
-        loss = F.mse_loss(model_out, target, reduction = 'none')
+        loss = self.loss_fn(model_out, target, reduction = 'none')
         loss = reduce(loss, 'b ... -> b (...)', 'mean')
 
         loss = loss * extract(self.loss_weight, t, loss.shape)
 
@@ -116,6 +116,7 @@ def __init__(
         *,
         image_size,
         channels = 3,
+        loss_type = 'l1',
         noise_schedule = 'linear',
         num_sample_steps = 500,
         clip_sample_denoised = True,
@@ -137,6 +138,8 @@ def __init__(
 
         # continuous noise schedule related stuff
 
+        self.loss_type = loss_type
+
         if noise_schedule == 'linear':
             self.log_snr = beta_linear_log_snr
         elif noise_schedule == 'cosine':
@@ -167,6 +170,15 @@ def __init__(
     def device(self):
         return next(self.model.parameters()).device
 
+    @property
+    def loss_fn(self):
+        if self.loss_type == 'l1':
+            return F.l1_loss
+        elif self.loss_type == 'l2':
+            return F.mse_loss
+        else:
+            raise ValueError(f'invalid loss type {self.loss_type}')
+
     def p_mean_variance(self, x, time, time_next):
         # reviewer found an error in the equation in the paper (missing sigma)
         # following - https://openreview.net/forum?id=2LdBqxc1Yv&noteId=rIQgH0zKsRt
@@ -254,7 +266,7 @@ def p_losses(self, x_start, times, noise = None):
         x, log_snr = self.q_sample(x_start = x_start, times = times, noise = noise)
         model_out = self.model(x, log_snr)
 
-        losses = F.mse_loss(model_out, noise, reduction = 'none')
+        losses = self.loss_fn(model_out, noise, reduction = 'none')
         losses = reduce(losses, 'b ... -> b', 'mean')
 
         if self.min_snr_loss_weight:
 
@@ -2,7 +2,7 @@
 import copy
 from pathlib import Path
 from random import random
-from functools import partial, wraps
+from functools import partial
 from collections import namedtuple
 from multiprocessing import cpu_count
 
@@ -68,11 +68,6 @@ def convert_image_to_fn(img_type, image):
         return image.convert(img_type)
     return image
 
-def extract(a, t, x_shape):
-    b, *_ = t.shape
-    out = a.gather(-1, t)
-    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
-
 # normalization functions
 
 def normalize_to_neg_one_to_one(img):
@@ -403,35 +398,13 @@ def forward(self, x, time, x_self_cond = None):
         x = self.final_res_block(x, t)
         return self.final_conv(x)
 
-# scheduling functions
-
-def enforce_zero_terminal_snr(schedule_fn):
-    # algorithm 1 in https://arxiv.org/abs/2305.08891
-
-    @wraps(schedule_fn)
-    def inner(*args, **kwargs):
-        betas = schedule_fn(*args, **kwargs)
-        alphas = 1. - betas
-
-        alphas_cumprod = torch.cumprod(alphas, dim = 0)
-        alphas_cumprod = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)
-
-        alphas_cumprod_sqrt = torch.sqrt(alphas_cumprod)
-
-        terminal_snr = alphas_cumprod_sqrt[-1].clone()
-
-        alphas_cumprod_sqrt -= terminal_snr # enforce zero terminal snr
-        alphas_cumprod_sqrt *= 1. / (1. - terminal_snr)
-
-        alphas_cumprod = alphas_cumprod_sqrt ** 2
-        alphas = alphas_cumprod[1:] / alphas_cumprod[:-1]
-        betas = 1. - alphas
-
-        return betas
+# gaussian diffusion trainer class
 
-    return inner
+def extract(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
 
-@enforce_zero_terminal_snr
 def linear_beta_schedule(timesteps):
     """
     linear schedule, proposed in original ddpm paper
@@ -453,7 +426,6 @@ def cosine_beta_schedule(timesteps, s = 0.008):
     betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
     return torch.clip(betas, 0, 1.)
 
-@enforce_zero_terminal_snr
 def sigmoid_beta_schedule(timesteps, start = -3, end = 3, tau = 1, clamp_min = 1e-5):
     """
     sigmoid schedule
@@ -469,8 +441,6 @@ def sigmoid_beta_schedule(timesteps, start = -3, end = 3, tau = 1, clamp_min = 1
     betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
     return torch.clip(betas, 0, 1.)
 
-# gaussian diffusion trainer class
-
 class GaussianDiffusion(nn.Module):
     def __init__(
         self,
@@ -479,6 +449,7 @@ def __init__(
         image_size,
         timesteps = 1000,
         sampling_timesteps = None,
+        loss_type = 'l1',
         objective = 'pred_noise',
         beta_schedule = 'sigmoid',
         schedule_fn_kwargs = dict(),
@@ -502,9 +473,7 @@ def __init__(
 
         assert objective in {'pred_noise', 'pred_x0', 'pred_v'}, 'objective must be either pred_noise (predict noise) or pred_x0 (predict image start) or pred_v (predict v [v-parameterization as defined in appendix D of progressive distillation paper, used in imagen-video successfully])'
 
-        if callable(beta_schedule):
-            beta_schedule_fn = beta_schedule
-        elif beta_schedule == 'linear':
+        if beta_schedule == 'linear':
             beta_schedule_fn = linear_beta_schedule
         elif beta_schedule == 'cosine':
             beta_schedule_fn = cosine_beta_schedule
@@ -516,11 +485,12 @@ def __init__(
         betas = beta_schedule_fn(timesteps, **schedule_fn_kwargs)
 
         alphas = 1. - betas
-        alphas_cumprod = torch.cumprod(alphas, dim = 0)
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
         alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)
 
         timesteps, = betas.shape
         self.num_timesteps = int(timesteps)
+        self.loss_type = loss_type
 
         # sampling related parameters
 
@@ -541,10 +511,6 @@ def __init__(
         # calculations for diffusion q(x_t | x_{t-1}) and others
 
         register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
-
-        terminal_snr = self.sqrt_alphas_cumprod[-1]
-        assert terminal_snr == 0, f'non-zero terminal SNR detected ({terminal_snr:.6f}), from https://arxiv.org/abs/2305.08891 paper - you can wrap your schedule function with `enforce_zero_terminal_snr` decorator'
-
         register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod))
         register_buffer('log_one_minus_alphas_cumprod', torch.log(1. - alphas_cumprod))
         register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod))
@@ -759,6 +725,15 @@ def q_sample(self, x_start, t, noise=None):
             extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
         )
 
+    @property
+    def loss_fn(self):
+        if self.loss_type == 'l1':
+            return F.l1_loss
+        elif self.loss_type == 'l2':
+            return F.mse_loss
+        else:
+            raise ValueError(f'invalid loss type {self.loss_type}')
+
     def p_losses(self, x_start, t, noise = None):
         b, c, h, w = x_start.shape
         noise = default(noise, lambda: torch.randn_like(x_start))
@@ -791,7 +766,7 @@ def p_losses(self, x_start, t, noise = None):
         else:
             raise ValueError(f'unknown objective {self.objective}')
 
-        loss = F.mse_loss(model_out, target, reduction = 'none')
+        loss = self.loss_fn(model_out, target, reduction = 'none')
         loss = reduce(loss, 'b ... -> b (...)', 'mean')
 
         loss = loss * extract(self.loss_weight, t, loss.shape)