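Add CacheMixin and PeftAdapterMixin to CogView4Transformer2DModel

Also derive the batch dimension from len(batch["captions"]) in the CogView4
control training example instead of hard-coding 4, and drop the explicit
max_sequence_length=512 argument from the validation pipeline call.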

zRzRzRzRzRzRzR · zRzRzRzRzRzRzR · commit 44bfd4c82779 · 2025-02-18T16:01:01.000+08:00
diff --git a/examples/cogview4-control/train_control_cogview4.py b/examples/cogview4-control/train_control_cogview4.py
@@ -127,7 +127,6 @@ def log_validation(cogview4_transformer, args, accelerator, weight_dtype, step,
                     num_inference_steps=50,
                     guidance_scale=args.guidance_scale,
                     generator=generator,
-                    max_sequence_length=512,
                     height=args.resolution,
                     width=args.resolution,
                 ).images[0]
@@ -1075,7 +1074,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 mu = torch.sqrt(image_seq_lens / 256)
                 mu = mu * 0.75 + 0.25
                 scale_factors = mu / (mu + (1 / sigmas - 1) ** 1.0).to(dtype=pixel_latents.dtype, device=pixel_latents.device)
-                scale_factors = scale_factors.view(4, 1, 1, 1)
+                scale_factors = scale_factors.view(len(batch["captions"]), 1, 1, 1)
                 noisy_model_input = (1.0 - scale_factors) * pixel_latents + scale_factors * noise
                 concatenated_noisy_model_input = torch.cat([noisy_model_input, control_latents], dim=1)
                 text_encoding_pipeline = text_encoding_pipeline.to("cuda")
@@ -1114,7 +1113,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 # flow-matching loss
                 target = noise - pixel_latents
 
-                weighting = weighting.unsqueeze(1).unsqueeze(2).unsqueeze(3)  # [4, 1, 1, 1]
+                weighting = weighting.view(len(batch["captions"]), 1, 1, 1)
                 loss = torch.mean((weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),1)
                 loss = loss.mean()
                 accelerator.backward(loss)
diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -17,13 +17,14 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+from ...loaders import PeftAdapterMixin
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...models.attention import FeedForward
 from ...models.attention_processor import Attention
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous
 from ...utils import logging
+from ..cache_utils import CacheMixin
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings
 from ..modeling_outputs import Transformer2DModelOutput
 
@@ -285,6 +286,6 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens
 
 
-class CogView4Transformer2DModel(ModelMixin, ConfigMixin):
+class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     r"""
     Args:
         patch_size (`int`, defaults to `2`):
@@ -390,7 +391,6 @@ def forward(
         p = self.config.patch_size
         post_patch_height = height // p
         post_patch_width = width // p
-
         hidden_states, encoder_hidden_states = self.patch_embed(hidden_states, encoder_hidden_states)
 
         temb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)