Fix image processing when the input is a tensor

linoytsaban · linoytsaban · commit 0580379cc2bc · 2025-05-26T17:17:53.000+03:00
diff --git a/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py b/src/diffusers/pipelines/sana/pipeline_sana_sprint_img2img.py
@@ -18,6 +18,7 @@
 import urllib.parse as ul
 import warnings
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch.nn.functional as F
 
 import torch
 from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
@@ -579,14 +580,22 @@ def _clean_caption(self, caption):
 
     def prepare_image(
         self,
-        image,
-        width,
-        height,
-        device,
-        dtype,
+        image: PipelineImageInput,
+        width: int,
+        height: int,
+        device: torch.device,
+        dtype: torch.dtype,
     ):
         if isinstance(image, torch.Tensor):
-            pass
+            if image.ndim == 3:
+                image = image.unsqueeze(0)
+            # Resize if current dimensions do not match target dimensions.
+            if image.shape[2] != height or image.shape[3] != width:
+                image = F.interpolate(image, size=(height, width), mode="bilinear",
+                                              align_corners=False)
+
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
         else:
             image = self.image_processor.preprocess(image, height=height, width=width)