@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numbers
 from typing import Any, Dict, Optional, Tuple
 
 import torch
@@ -54,6 +55,34 @@ def forward(self, hidden_states, scale=None):
         return hidden_states
 
 
+class MochiRMSNorm(nn.Module):
+    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
+        super().__init__()
+
+        self.eps = eps
+
+        if isinstance(dim, numbers.Integral):
+            dim = (dim,)
+
+        self.dim = torch.Size(dim)
+
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim))
+        else:
+            self.weight = None
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+        if self.weight is not None:
+            hidden_states = hidden_states * self.weight
+        hidden_states = hidden_states.to(input_dtype)
+
+        return hidden_states
+
+
 class MochiLayerNormContinuous(nn.Module):
     def __init__(
         self,
@@ -139,10 +168,10 @@ def __init__(
 
         self.heads = out_dim // dim_head if out_dim is not None else heads
 
-        self.norm_q = RMSNorm(dim_head, eps, True)
-        self.norm_k = RMSNorm(dim_head, eps, True)
-        self.norm_added_q = RMSNorm(dim_head, eps, True)
-        self.norm_added_k = RMSNorm(dim_head, eps, True)
+        self.norm_q = MochiRMSNorm(dim_head, eps, True)
+        self.norm_k = MochiRMSNorm(dim_head, eps, True)
+        self.norm_added_q = MochiRMSNorm(dim_head, eps, True)
+        self.norm_added_k = MochiRMSNorm(dim_head, eps, True)
 
         self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
         self.to_k = nn.Linear(query_dim, self.inner_dim, bias=bias)
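For reference, a minimal, self-contained sketch of how the new class behaves. The class body is taken from the diff above; the `dim_head=64`, `eps=1e-5`, and tensor shape are illustrative assumptions, not values from this PR. The notable detail, visible in the diff itself, is that `MochiRMSNorm` applies the affine weight while the activations are still in float32 and only then casts back to the input dtype.

```python
import numbers

import torch
from torch import nn


class MochiRMSNorm(nn.Module):
    """RMS norm that computes statistics and applies the affine weight in
    float32, then casts the result back to the input dtype (per the diff)."""

    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
        super().__init__()
        self.eps = eps
        if isinstance(dim, numbers.Integral):
            dim = (dim,)
        self.dim = torch.Size(dim)
        self.weight = nn.Parameter(torch.ones(dim)) if elementwise_affine else None

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        # Mean of squares is computed in float32 for numerical stability.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
        # Weight is applied before the downcast, i.e. still in float32
        # (type promotion upcasts half-precision inputs here).
        if self.weight is not None:
            hidden_states = hidden_states * self.weight
        return hidden_states.to(input_dtype)


# Illustrative usage: dim_head=64, eps=1e-5, and the (batch, heads, seq, dim_head)
# shape are assumptions for this sketch, not values taken from the PR.
norm_q = MochiRMSNorm(64, eps=1e-5, elementwise_affine=True)
query = torch.randn(2, 24, 128, 64, dtype=torch.bfloat16)
out = norm_q(query)
assert out.dtype == torch.bfloat16  # result is cast back to the input dtype
```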