-einops

hlky · hlky · commit a40c95f24085 · 2025-04-10T14:04:17.000+01:00
diff --git a/src/diffusers/models/transformers/transformer_hidream_image.py b/src/diffusers/models/transformers/transformer_hidream_image.py
@@ -1,7 +1,6 @@
 import math
 from typing import Any, Dict, List, Optional, Tuple
 
-import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -756,21 +755,26 @@ def expand_timesteps(self, timesteps, batch_size, device):
 
     def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]], is_training: bool) -> List[torch.Tensor]:
         if is_training:
-            x = einops.rearrange(
-                x, "B S (p1 p2 C) -> B C S (p1 p2)", p1=self.config.patch_size, p2=self.config.patch_size
+            B, S, F = x.shape
+            C = F // (self.config.patch_size * self.config.patch_size)
+            x = (
+                x.reshape(B, S, self.config.patch_size, self.config.patch_size, C)
+                .permute(0, 4, 1, 2, 3)
+                .reshape(B, C, S, self.config.patch_size * self.config.patch_size)
             )
         else:
             x_arr = []
+            p1 = self.config.patch_size
+            p2 = self.config.patch_size
             for i, img_size in enumerate(img_sizes):
                 pH, pW = img_size
-                x_arr.append(
-                    einops.rearrange(
-                        x[i, : pH * pW].reshape(1, pH, pW, -1),
-                        "B H W (p1 p2 C) -> B C (H p1) (W p2)",
-                        p1=self.config.patch_size,
-                        p2=self.config.patch_size,
-                    )
-                )
+                t = x[i, : pH * pW].reshape(1, pH, pW, -1)
+                F_token = t.shape[-1]
+                C = F_token // (p1 * p2)
+                t = t.reshape(1, pH, pW, p1, p2, C)
+                t = t.permute(0, 5, 1, 3, 2, 4)
+                t = t.reshape(1, C, pH * p1, pW * p2)
+                x_arr.append(t)
             x = torch.cat(x_arr, dim=0)
         return x
 
@@ -789,12 +793,14 @@ def patchify(self, x, max_seq, img_sizes=None):
         if img_sizes is not None:
             for i, img_size in enumerate(img_sizes):
                 x_masks[i, 0 : img_size[0] * img_size[1]] = 1
-            x = einops.rearrange(x, "B C S p -> B S (p C)", p=pz2, C=C)
+            B, C, S, _ = x.shape
+            x = x.permute(0, 2, 3, 1).reshape(B, S, pz2 * C)
         elif isinstance(x, torch.Tensor):
-            pH, pW = x.shape[-2] // self.config.patch_size, x.shape[-1] // self.config.patch_size
-            x = einops.rearrange(
-                x, "B C (H p1) (W p2) -> B (H W) (p1 p2 C)", p1=self.config.patch_size, p2=self.config.patch_size, C=C
-            )
+            B, C, Hp1, Wp2 = x.shape
+            pH, pW = Hp1 // self.config.patch_size, Wp2 // self.config.patch_size
+            x = x.reshape(B, C, pH, self.config.patch_size, pW, self.config.patch_size)
+            x = x.permute(0, 2, 4, 3, 5, 1)
+            x = x.reshape(B, pH * pW, self.config.patch_size * self.config.patch_size * C)
             img_sizes = [[pH, pW]] * B
             x_masks = None
         else:
diff --git a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py
@@ -2,7 +2,6 @@
 import math
 from typing import Any, Callable, Dict, List, Optional, Union
 
-import einops
 import torch
 from transformers import (
     CLIPTextModelWithProjection,
@@ -679,9 +678,9 @@ def __call__(
                         dtype=latent_model_input.dtype,
                         device=latent_model_input.device,
                     )
-                    latent_model_input = einops.rearrange(
-                        latent_model_input, "B C (H p1) (W p2) -> B C (H W) (p1 p2)", p1=patch_size, p2=patch_size
-                    )
+                    latent_model_input = latent_model_input.reshape(B, C, pH, patch_size, pW, patch_size)
+                    latent_model_input = latent_model_input.permute(0, 1, 2, 4, 3, 5)
+                    latent_model_input = latent_model_input.reshape(B, C, pH * pW, patch_size * patch_size)
                     out[:, :, 0 : pH * pW] = latent_model_input
                     latent_model_input = out