revert to having n_targets as a pipeline property

toshas · toshas · commit b3d1152357f4 · 2025-02-25T00:12:46.000+01:00
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
@@ -203,6 +203,10 @@ def __init__(
 
         self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
+    @property
+    def n_targets(self):  # how many latent-sized channel groups the UNet output holds
+        return self.unet.config.out_channels // self.vae.config.latent_channels
+
     def check_inputs(
         self,
         image: PipelineImageInput,
@@ -550,9 +554,8 @@ def __call__(
         # 6. Decode predictions from latent into pixel space. The resulting `N * E` predictions have shape `(PPH, PPW)`,
         # which requires slight postprocessing. Decoding into pixel space happens in batches of size `batch_size`.
         # Model invocation: self.vae.decoder.
-        n_targets = self.unet.config.out_channels // self.vae.config.latent_channels
         pred_latent_for_decoding = pred_latent.reshape(
-            num_images * ensemble_size * n_targets, self.vae.config.latent_channels, *pred_latent.shape[2:]
+            num_images * ensemble_size * self.n_targets, self.vae.config.latent_channels, *pred_latent.shape[2:]
         )  # [N*E*T,4,PPH,PPW]
         prediction = torch.cat(
             [
@@ -577,7 +580,7 @@ def __call__(
         uncertainty = None
         if ensemble_size > 1:
             prediction = prediction.reshape(
-                num_images, ensemble_size, n_targets, *prediction.shape[1:]
+                num_images, ensemble_size, self.n_targets, *prediction.shape[1:]
             )  # [N,E,T,3,PH,PW]
             prediction = [
                 self.ensemble_intrinsics(prediction[i], output_uncertainty, **(ensembling_kwargs or {}))
@@ -650,9 +653,8 @@ def retrieve_latents(encoder_output):
 
         pred_latent = latents
         if pred_latent is None:
-            n_targets = self.unet.config.out_channels // self.vae.config.latent_channels
             pred_latent = randn_tensor(
-                (N_E, n_targets * C, H, W),
+                (N_E, self.n_targets * C, H, W),
                 generator=generator,
                 device=image_latent.device,
                 dtype=image_latent.dtype,