 from transformers import T5EncoderModel, T5TokenizerFast

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
-from ...models.autoencoders import AutoencoderKLMochi
+from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import MochiTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
@@ -56,7 +56,7 @@
         >>> pipe.enable_model_cpu_offload()
         >>> pipe.enable_vae_tiling()
         >>> prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k."
-        >>> frames = pipe(prompt, num_inference_steps=50, guidance_scale=3.5).frames[0]
+        >>> frames = pipe(prompt, num_inference_steps=28, guidance_scale=3.5).frames[0]
         >>> export_to_video(frames, "mochi.mp4")
         ```
 """
@@ -164,8 +164,8 @@ class MochiPipeline(DiffusionPipeline):
             Conditional Transformer architecture to denoise the encoded video latents.
         scheduler ([`FlowMatchEulerDiscreteScheduler`]):
             A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKLMochi`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
         text_encoder ([`T5EncoderModel`]):
             [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
             the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
@@ -184,7 +184,7 @@ class MochiPipeline(DiffusionPipeline):
     def __init__(
         self,
         scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKLMochi,
+        vae: AutoencoderKL,
         text_encoder: T5EncoderModel,
         tokenizer: T5TokenizerFast,
         transformer: MochiTransformer3DModel,
@@ -198,11 +198,17 @@ def __init__(
             transformer=transformer,
             scheduler=scheduler,
         )
-
-        self.vae_scale_factor_spatial = vae.spatial_compression_ratio if hasattr(self, "vae") else 8
-        self.vae_scale_factor_temporal = vae.temporal_compression_ratio if hasattr(self, "vae") else 6
-
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        # TODO: determine these scaling factors from model parameters
+        self.vae_spatial_scale_factor = 8
+        self.vae_temporal_scale_factor = 6
+        self.patch_size = 2
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_scale_factor)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_height = 480
+        self.default_width = 848

     # Adapted from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
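The new `__init__` hard-codes the 8x spatial and 6x temporal compression factors behind a TODO, while the removed lines show where those values would come from. A small hypothetical helper sketching that derivation (not part of this commit; it assumes the VAE exposes `spatial_compression_ratio` and `temporal_compression_ratio`, as the removed code did):

```python
def infer_vae_scale_factors(vae):
    """Sketch: prefer the VAE's own compression ratios when it exposes them,
    falling back to the Mochi defaults of 8 (spatial) and 6 (temporal)."""
    spatial = getattr(vae, "spatial_compression_ratio", 8)
    temporal = getattr(vae, "temporal_compression_ratio", 6)
    return spatial, temporal
```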
@@ -253,6 +259,14 @@ def _get_t5_prompt_embeds(

         return prompt_embeds, prompt_attention_mask

+    def prepare_joint_attention_mask(self, prompt_attention_mask, latents):
+        batch_size, channels, latent_frames, latent_height, latent_width = latents.shape
+        num_latents = latent_frames * latent_height * latent_width
+        num_visual_tokens = num_latents // (self.transformer.config.patch_size**2)
+        mask = F.pad(prompt_attention_mask, (num_visual_tokens, 0), value=True)
+
+        return mask
+
     # Adapted from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
     def encode_prompt(
         self,
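The added `prepare_joint_attention_mask` prepends one all-True entry per visual token to the text attention mask, so the mask covers the concatenated visual-plus-text sequence the transformer attends over. A standalone sketch with illustrative sizes (the shapes below are assumptions chosen to match the 480x848, 19-frame defaults elsewhere in this diff):

```python
import torch
import torch.nn.functional as F

# Illustrative sizes: one prompt of 256 text tokens and latents of shape
# (batch, channels, frames, height, width) = (1, 12, 4, 60, 106), patch size 2.
prompt_attention_mask = torch.ones(1, 256, dtype=torch.bool)
latents = torch.randn(1, 12, 4, 60, 106)
patch_size = 2

num_latents = latents.shape[2] * latents.shape[3] * latents.shape[4]  # 4 * 60 * 106 = 25440
num_visual_tokens = num_latents // patch_size**2                      # 25440 // 4 = 6360
# Left-pad the text mask with True so every visual token position is kept.
joint_mask = F.pad(prompt_attention_mask, (num_visual_tokens, 0), value=True)
print(joint_mask.shape)  # torch.Size([1, 6616]) -> 6360 visual + 256 text positions
```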
@@ -335,12 +349,7 @@ def encode_prompt(
             dtype=dtype,
         )

-        return (
-            prompt_embeds,
-            prompt_attention_mask,
-            negative_prompt_embeds,
-            negative_prompt_attention_mask,
-        )
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

     def check_inputs(
         self,
@@ -424,13 +433,6 @@ def disable_vae_tiling(self):
         """
         self.vae.disable_tiling()

-    def prepare_joint_attention_mask(self, prompt_attention_mask, latents):
-        batch_size, channels, latent_frames, latent_height, latent_width = latents.shape
-        num_latents = latent_frames * latent_height * latent_width
-        num_visual_tokens = num_latents // (self.transformer.config.patch_size**2)
-        mask = F.pad(prompt_attention_mask, (num_visual_tokens, 0), value=True)
-        return mask
-
     def prepare_latents(
         self,
         batch_size,
@@ -443,9 +445,9 @@ def prepare_latents(
         generator,
         latents=None,
     ):
-        height = height // self.vae_scale_factor_spatial
-        width = width // self.vae_scale_factor_spatial
-        num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        height = height // self.vae_spatial_scale_factor
+        width = width // self.vae_spatial_scale_factor
+        num_frames = (num_frames - 1) // self.vae_temporal_scale_factor + 1

         shape = (batch_size, num_channels_latents, num_frames, height, width)

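For the defaults set in `__init__` (480x848, 19 frames), the renamed factors give the following latent shape; a worked example of the integer arithmetic above (the channel count of 12 is an assumption taken from the `view(1, 12, 1, 1, 1)` denormalization later in the file):

```python
# Worked example of the shape prepare_latents builds for the default settings.
height, width, num_frames = 480, 848, 19
vae_spatial_scale_factor, vae_temporal_scale_factor = 8, 6

latent_height = height // vae_spatial_scale_factor                 # 480 // 8 = 60
latent_width = width // vae_spatial_scale_factor                   # 848 // 8 = 106
latent_frames = (num_frames - 1) // vae_temporal_scale_factor + 1  # 18 // 6 + 1 = 4

num_channels_latents = 12  # assumed; the pipeline reads transformer.config.in_channels
print((1, num_channels_latents, latent_frames, latent_height, latent_width))  # (1, 12, 4, 60, 106)
```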
@@ -485,7 +487,7 @@ def __call__(
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_frames: int = 19,
-        num_inference_steps: int = 50,
+        num_inference_steps: int = 64,
         timesteps: List[int] = None,
         guidance_scale: float = 4.5,
         num_videos_per_prompt: Optional[int] = 1,
@@ -508,13 +510,13 @@ def __call__(
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.
-            height (`int`, *optional*, defaults to `self.transformer.config.sample_height * self.vae.spatial_compression_ratio`):
+            height (`int`, *optional*, defaults to `self.default_height`):
                 The height in pixels of the generated image. This is set to 480 by default for the best results.
-            width (`int`, *optional*, defaults to `self.transformer.config.sample_width * self.vae.spatial_compression_ratio`):
+            width (`int`, *optional*, defaults to `self.default_width`):
                 The width in pixels of the generated image. This is set to 848 by default for the best results.
             num_frames (`int`, defaults to `19`):
                 The number of video frames to generate
-            num_inference_steps (`int`, *optional*, defaults to `50`):
+            num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             timesteps (`List[int]`, *optional*):
@@ -574,8 +576,8 @@ def __call__(
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or 480  # self.transformer.config.sample_height * self.vae_scaling_factor_spatial
-        width = width or 848  # self.transformer.config.sample_width * self.vae_scaling_factor_spatial
+        height = height or self.default_height
+        width = width or self.default_width

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
@@ -601,6 +603,7 @@ def __call__(
         batch_size = prompt_embeds.shape[0]

         device = self._execution_device
+
         # 3. Prepare text embeddings
         (
             prompt_embeds,
@@ -619,10 +622,6 @@ def __call__(
             max_sequence_length=max_sequence_length,
             device=device,
         )
-        # if self.do_classifier_free_guidance:
-        #     prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-        #     prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
-
         # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(
@@ -636,16 +635,20 @@ def __call__(
             generator,
             latents,
         )
+        joint_attention_mask = self.prepare_joint_attention_mask(prompt_attention_mask, latents)
+        negative_joint_attention_mask = self.prepare_joint_attention_mask(negative_prompt_attention_mask, latents)
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+            joint_attention_mask = torch.cat([negative_joint_attention_mask, joint_attention_mask], dim=0)

         # 5. Prepare timestep
         # from https://github.com/genmoai/models/blob/075b6e36db58f1242921deff83a1066887b9c9e1/src/mochi_preview/infer.py#L77
         threshold_noise = 0.025
         sigmas = linear_quadratic_schedule(num_inference_steps, threshold_noise)
         sigmas = np.array(sigmas)

-        joint_attention_mask = self.prepare_joint_attention_mask(prompt_attention_mask, latents)
-        negative_joint_attention_mask = self.prepare_joint_attention_mask(negative_prompt_attention_mask, latents)
-
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
             num_inference_steps,
@@ -662,40 +665,28 @@ def __call__(
                 if self.interrupt:
                     continue

-                # latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                # # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                # timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)
-
-                latent_model_input = latents
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)

-                noise_pred_text = self.transformer(
+                noise_pred = self.transformer(
                     hidden_states=latent_model_input,
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     encoder_attention_mask=prompt_attention_mask,
                     joint_attention_mask=joint_attention_mask,
                     return_dict=False,
                 )[0]
+                # Mochi CFG + Sampling runs in FP32
+                noise_pred = noise_pred.to(torch.float32)

                 if self.do_classifier_free_guidance:
-                    noise_pred_uncond = self.transformer(
-                        hidden_states=latent_model_input,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        timestep=timestep,
-                        encoder_attention_mask=negative_prompt_attention_mask,
-                        joint_attention_mask=negative_joint_attention_mask,
-                        return_dict=False,
-                    )[0]
-                    noise_pred = noise_pred_uncond.float() + self.guidance_scale * (
-                        noise_pred_text.float() - noise_pred_uncond.float()
-                    )
-                else:
-                    noise_pred = noise_pred_text
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
-                latents = self.scheduler.step(noise_pred, t, latents.float(), return_dict=False)[0]
+                latents = self.scheduler.step(noise_pred, t, latents.to(torch.float32), return_dict=False)[0]
                 latents = latents.to(latents_dtype)

                 if latents.dtype != latents_dtype:
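The loop now runs one batched transformer call and splits the result with `chunk(2)` instead of two separate conditional/unconditional calls. A minimal sketch of that guidance combine in isolation (random tensors stand in for the model output):

```python
import torch

guidance_scale = 4.5
# Stand-in for the batched FP32 model output: [unconditional batch, text-conditioned batch].
noise_pred = torch.randn(2, 12, 4, 60, 106, dtype=torch.float32)

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
# Classifier-free guidance: push the prediction away from the unconditional branch.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)  # torch.Size([1, 12, 4, 60, 106])
```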
@@ -718,33 +709,27 @@ def __call__(

                 if XLA_AVAILABLE:
                     xm.mark_step()
+
         if output_type == "latent":
             video = latents
         else:
-            with torch.autocast("cuda", torch.float32):
-                # unscale/denormalize the latents
-                # denormalize with the mean and std if available and not None
-                has_latents_mean = (
-                    hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            # unscale/denormalize the latents
+            # denormalize with the mean and std if available and not None
+            has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+            has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+            if has_latents_mean and has_latents_std:
+                latents_mean = (
+                    torch.tensor(self.vae.config.latents_mean).view(1, 12, 1, 1, 1).to(latents.device, latents.dtype)
                 )
-                has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
-                if has_latents_mean and has_latents_std:
-                    latents_mean = (
-                        torch.tensor(self.vae.config.latents_mean)
-                        .view(1, 12, 1, 1, 1)
-                        .to(latents.device, latents.dtype)
-                    )
-                    latents_std = (
-                        torch.tensor(self.vae.config.latents_std)
-                        .view(1, 12, 1, 1, 1)
-                        .to(latents.device, latents.dtype)
-                    )
-                    latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
-                else:
-                    latents = latents / self.vae.config.scaling_factor
-
-                video = self.vae.decode(latents, return_dict=False)[0]
-                video = self.video_processor.postprocess_video(video, output_type=output_type)
+                latents_std = (
+                    torch.tensor(self.vae.config.latents_std).view(1, 12, 1, 1, 1).to(latents.device, latents.dtype)
+                )
+                latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+            else:
+                latents = latents / self.vae.config.scaling_factor
+
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)

         # Offload all models
         self.maybe_free_model_hooks()