@@ -423,12 +423,18 @@ def __init__(
         objective = 'pred_noise',
         beta_schedule = 'cosine',
         ddim_sampling_eta = 0.,
-        auto_normalize = True
+        auto_normalize = True,
+        channels = None,
+        self_condition = None,
+        channel_first = True
     ):
         super().__init__()
         self.model = model
-        self.channels = self.model.channels
-        self.self_condition = self.model.self_condition
+        self.channels = default(channels, lambda: self.model.channels)
+        self.self_condition = default(self_condition, lambda: self.model.self_condition)
+
+        self.channel_first = channel_first
+        self.seq_index = -2 if not channel_first else -1

         self.seq_length = seq_length

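The three new constructor arguments let the wrapper override, rather than read, the model's attributes, and `seq_index` records which dimension holds the sequence. A standalone sketch of those semantics, using a copy of the repository's `default` fallback helper; `DummyModel` is a hypothetical stand-in for illustration only:

# standalone mirror of the new __init__ logic; DummyModel and this copy
# of the repo's `default` helper are illustrative stand-ins
def default(val, d):
    if val is not None:
        return val
    return d() if callable(d) else d

class DummyModel:
    channels = 3
    self_condition = False

model = DummyModel()

channels = default(None, lambda: model.channels)               # None -> falls back to model.channels == 3
self_condition = default(False, lambda: model.self_condition)  # an explicit value wins, even if falsy

channel_first = False                                          # sequences arrive as (batch, seq, channels)
seq_index = -2 if not channel_first else -1                    # -> -2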
@@ -638,7 +644,9 @@ def ddim_sample(self, shape, clip_denoised = True):
     def sample(self, batch_size = 16):
         seq_length, channels = self.seq_length, self.channels
         sample_fn = self.p_sample_loop if not self.is_ddim_sampling else self.ddim_sample
-        return sample_fn((batch_size, channels, seq_length))
+
+        shape = (batch_size, channels, seq_length) if self.channel_first else (batch_size, seq_length, channels)
+        return sample_fn(shape)

     @torch.no_grad()
     def interpolate(self, x1, x2, t = None, lam = 0.5):
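With the flag in place, `sample` builds the starting noise shape to match the configured layout. A quick standalone check of the same conditional (the function name here is hypothetical):

# mirrors the shape conditional added to sample()
def sample_shape(batch_size, channels, seq_length, channel_first):
    return (batch_size, channels, seq_length) if channel_first else (batch_size, seq_length, channels)

assert sample_shape(16, 3, 128, channel_first = True) == (16, 3, 128)
assert sample_shape(16, 3, 128, channel_first = False) == (16, 128, 3)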
@@ -669,8 +677,10 @@ def q_sample(self, x_start, t, noise = None):
             extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
         )

-    def p_losses(self, x_start, t, noise = None):
-        b, c, n = x_start.shape
+    def p_losses(self, x_start, t, noise = None, model_forward_kwargs: dict = dict()):
+        b = x_start.shape[0]
+        n = x_start.shape[self.seq_index]
+
         noise = default(noise, lambda: torch.randn_like(x_start))

         # noise sample
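`p_losses` no longer destructures a fixed `(b, c, n)` shape; it reads the batch size from dim 0 and the sequence length through `seq_index`, so both layouts work. Illustrated on stand-in tensors:

import torch

x_channel_first = torch.randn(16, 3, 128)   # (batch, channels, seq), seq_index = -1
x_channel_last  = torch.randn(16, 128, 3)   # (batch, seq, channels), seq_index = -2

assert x_channel_first.shape[-1] == 128
assert x_channel_last.shape[-2] == 128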
@@ -687,9 +697,13 @@ def p_losses(self, x_start, t, noise = None):
                 x_self_cond = self.model_predictions(x, t).pred_x_start
                 x_self_cond.detach_()

+        # model kwargs
+
+        model_forward_kwargs = {**model_forward_kwargs, 'self_cond': x_self_cond}
+
         # predict and take gradient step

-        model_out = self.model(x, t, x_self_cond)
+        model_out = self.model(x, t, **model_forward_kwargs)

         if self.objective == 'pred_noise':
             target = noise
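The new `model_forward_kwargs` parameter lets callers thread extra keyword arguments through to the denoising model; the self-conditioning tensor is merged in non-destructively under the `'self_cond'` key, so the wrapped model is now expected to accept that keyword. A standalone sketch of the merge (the `'cond'` entry is a hypothetical example, not from the diff):

def merge_forward_kwargs(model_forward_kwargs, x_self_cond):
    # dict splat keeps caller-supplied kwargs and adds (or overwrites) self_cond
    return {**model_forward_kwargs, 'self_cond': x_self_cond}

kwargs = merge_forward_kwargs({'cond': 'extra conditioning'}, None)
assert kwargs == {'cond': 'extra conditioning', 'self_cond': None}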
@@ -708,7 +722,8 @@ def p_losses(self, x_start, t, noise = None):
         return loss.mean()

     def forward(self, img, *args, **kwargs):
-        b, c, n, device, seq_length, = *img.shape, img.device, self.seq_length
+        b, n, device, seq_length = img.shape[0], img.shape[self.seq_index], img.device, self.seq_length
+
         assert n == seq_length, f'seq length must be {seq_length}'
         t = torch.randint(0, self.num_timesteps, (b,), device = device).long()

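Taken together, the changes mean a channel-last model that lacks `channels` / `self_condition` attributes can be wrapped directly. A hypothetical end-to-end sketch, assuming a version of `GaussianDiffusion1D` that includes this commit and the package's usual export; `TinyDenoiser` is an illustrative stand-in, not library code:

import torch
from torch import nn
from denoising_diffusion_pytorch import GaussianDiffusion1D  # assumed export

# hypothetical channel-last denoiser: maps (batch, seq, channels) -> (batch, seq, channels)
class TinyDenoiser(nn.Module):
    def __init__(self, channels = 3, dim = 64):
        super().__init__()
        self.proj_in = nn.Linear(channels, dim)
        self.to_time = nn.Linear(1, dim)
        self.proj_out = nn.Linear(dim, channels)

    def forward(self, x, t, self_cond = None):   # accepts the merged 'self_cond' kwarg
        h = self.proj_in(x) + self.to_time(t.float()[:, None, None] / 1000.)
        return self.proj_out(torch.relu(h))

diffusion = GaussianDiffusion1D(
    TinyDenoiser(),
    seq_length = 128,
    channels = 3,            # overrides: no longer read off the model
    self_condition = False,
    channel_first = False    # training data is (batch, seq, channels)
)

loss = diffusion(torch.randn(8, 128, 3))
loss.backward()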