
Commit f35850c

address reviews
1 parent 0c1ebc3 commit f35850c

5 files changed: 110 additions & 150 deletions

docs/source/en/api/pipelines/cogview3.md

Lines changed: 1 addition & 40 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.
 -->

-# CogVideoX
+# CogView3Plus

 [CogView3: Finer and Faster Text-to-Image Generation via Relay Diffusion](https://huggingface.co/papers/2403.05121) from Tsinghua University & ZhipuAI, by Wendi Zheng, Jiayan Teng, Zhuoyi Yang, Weihan Wang, Jidong Chen, Xiaotao Gu, Yuxiao Dong, Ming Ding, Jie Tang.

@@ -29,45 +29,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m

 This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).

-## Inference
-
-Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fast_diffusion#torchcompile) to reduce the inference latency.
-
-First, load the pipeline:
-
-```python
-import torch
-from diffusers import CogView3PlusPipeline
-from diffusers.utils import export_to_video,load_image
-
-pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3Plus-3b").to("cuda") # or "THUDM/CogVideoX-2b"
-```
-
-Then change the memory layout of the `transformer` and `vae` components to `torch.channels_last`:
-
-```python
-pipe.transformer.to(memory_format=torch.channels_last)
-pipe.vae.to(memory_format=torch.channels_last)
-```
-
-Compile the components and run inference:
-
-```python
-pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
-pipe.vae.decode = torch.compile(pipeline.vae.decode, mode="max-autotune", fullgraph=True)
-
-# CogVideoX works well with long and well-described prompts
-prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
-video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-```
-
-The [benchmark](TODO) results on an 80GB A100 machine are:
-
-```
-Without torch.compile(): Average inference time: TODO seconds.
-With torch.compile(): Average inference time: TODO seconds.
-```
-
 ## CogView3PlusPipeline

 [[autodoc]] CogView3PlusPipeline
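
For reference, a minimal text-to-image sketch for the pipeline documented in this file (not part of the diff). The checkpoint id is taken from the removed snippet above and may differ from the final published repository name; the dtype and sampler settings are illustrative assumptions:

```python
import torch
from diffusers import CogView3PlusPipeline

# Assumed checkpoint id (copied from the removed snippet); the published repo name may differ.
pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3Plus-3b", torch_dtype=torch.bfloat16).to("cuda")

prompt = "A photograph of a red panda sipping tea in a misty bamboo forest"
# CogView3Plus generates images, so the output exposes `.images` rather than `.frames`.
image = pipe(prompt=prompt, guidance_scale=7.0, num_inference_steps=50).images[0]
image.save("cogview3.png")
```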

src/diffusers/models/embeddings.py

Lines changed: 61 additions & 57 deletions
@@ -442,6 +442,60 @@ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
         return embeds


+class CogView3PlusPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        in_channels: int = 16,
+        hidden_size: int = 2560,
+        patch_size: int = 2,
+        text_hidden_size: int = 4096,
+        pos_embed_max_size: int = 128,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_size = hidden_size
+        self.patch_size = patch_size
+        self.text_hidden_size = text_hidden_size
+        self.pos_embed_max_size = pos_embed_max_size
+        # Linear projection for image patches
+        self.proj = nn.Linear(in_channels * patch_size**2, hidden_size)
+
+        # Linear projection for text embeddings
+        self.text_proj = nn.Linear(text_hidden_size, hidden_size)
+
+        pos_embed = get_2d_sincos_pos_embed(hidden_size, pos_embed_max_size, base_size=pos_embed_max_size)
+        pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size)
+        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float(), persistent=False)
+
+    def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, channel, height, width = hidden_states.shape
+
+        if height % self.patch_size != 0 or width % self.patch_size != 0:
+            raise ValueError("Height and width must be divisible by patch size")
+
+        height = height // self.patch_size
+        width = width // self.patch_size
+        hidden_states = hidden_states.view(batch_size, channel, height, self.patch_size, width, self.patch_size)
+        hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).contiguous()
+        hidden_states = hidden_states.view(batch_size, height * width, channel * self.patch_size * self.patch_size)
+
+        # Project the patches
+        hidden_states = self.proj(hidden_states)
+        encoder_hidden_states = self.text_proj(encoder_hidden_states)
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        # Calculate text_length
+        text_length = encoder_hidden_states.shape[1]
+
+        image_pos_embed = self.pos_embed[:height, :width].reshape(height * width, -1)
+        text_pos_embed = torch.zeros(
+            (text_length, self.hidden_size), dtype=image_pos_embed.dtype, device=image_pos_embed.device
+        )
+        pos_embed = torch.cat([text_pos_embed, image_pos_embed], dim=0)[None, ...]
+
+        return (hidden_states + pos_embed).to(hidden_states.dtype)
+
+
 def get_3d_rotary_pos_embed(
     embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
@@ -714,58 +768,6 @@ def forward(self, ids: torch.Tensor) -> torch.Tensor:
         return freqs_cos, freqs_sin


-class CogView3PlusPatchEmbed(nn.Module):
-    def __init__(
-        self,
-        in_channels: int = 16,
-        hidden_size: int = 2560,
-        patch_size: int = 2,
-        text_hidden_size: int = 4096,
-        pos_embed_max_size: int = 128,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.hidden_size = hidden_size
-        self.patch_size = patch_size
-        self.text_hidden_size = text_hidden_size
-        self.pos_embed_max_size = pos_embed_max_size
-        # Linear projection for image patches
-        self.proj = nn.Linear(in_channels * patch_size**2, hidden_size)
-
-        # Linear projection for text embeddings
-        self.text_proj = nn.Linear(text_hidden_size, hidden_size)
-
-        pos_embed = get_2d_sincos_pos_embed(hidden_size, pos_embed_max_size, base_size=pos_embed_max_size)
-        pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size)
-        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float(), persistent=False)
-
-    def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor = None) -> torch.Tensor:
-        batch_size, channel, height, width = hidden_states.shape
-        if height % self.patch_size != 0 or width % self.patch_size != 0:
-            raise ValueError("Height and width must be divisible by patch size")
-        height = height // self.patch_size
-        width = width // self.patch_size
-        hidden_states = hidden_states.view(batch_size, channel, height, self.patch_size, width, self.patch_size)
-        hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).contiguous()
-        hidden_states = hidden_states.view(batch_size, height * width, channel * self.patch_size * self.patch_size)
-
-        # Project the patches
-        hidden_states = self.proj(hidden_states)
-        encoder_hidden_states = self.text_proj(encoder_hidden_states)
-        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
-
-        # Calculate text_length
-        text_length = encoder_hidden_states.shape[1]
-
-        image_pos_embed = self.pos_embed[:height, :width].reshape(height * width, -1)
-        text_pos_embed = torch.zeros(
-            (text_length, self.hidden_size), dtype=image_pos_embed.dtype, device=image_pos_embed.device
-        )
-        pos_embed = torch.cat([text_pos_embed, image_pos_embed], dim=0)[None, ...]
-
-        return (hidden_states + pos_embed).to(hidden_states.dtype)
-
-
 class TimestepEmbedding(nn.Module):
     def __init__(
         self,
@@ -1090,11 +1092,11 @@ def forward(self, timestep, class_labels, hidden_dtype=None):


 class CombinedTimestepTextProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim, pooled_projection_dim, timesteps_dim=256):
+    def __init__(self, embedding_dim, pooled_projection_dim):
         super().__init__()

-        self.time_proj = Timesteps(num_channels=timesteps_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=timesteps_dim, time_embed_dim=embedding_dim)
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
         self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

     def forward(self, timestep, pooled_projection):
@@ -1132,7 +1134,7 @@ def forward(self, timestep, guidance, pooled_projection):
         return conditioning


-class CogView3CombinedTimestepConditionEmbeddings(nn.Module):
+class CogView3CombinedTimestepSizeEmbeddings(nn.Module):
     def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256):
         super().__init__()

@@ -1154,9 +1156,11 @@ def forward(
         original_size_proj = self.condition_proj(original_size.flatten()).view(original_size.size(0), -1)
         crop_coords_proj = self.condition_proj(crop_coords.flatten()).view(crop_coords.size(0), -1)
         target_size_proj = self.condition_proj(target_size.flatten()).view(target_size.size(0), -1)
+
+        # (B, 3 * condition_dim)
         condition_proj = torch.cat(
             [original_size_proj, crop_coords_proj, target_size_proj], dim=1
-        ) # (B, 3 * condition_dim)
+        )

         timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (B, embedding_dim)
         condition_emb = self.condition_embedder(condition_proj.to(dtype=hidden_dtype)) # (B, embedding_dim)
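
The relocated `CogView3PlusPatchEmbed.forward` turns a `(B, C, H, W)` latent into `(H / p) * (W / p)` image tokens and prepends the projected text tokens before adding positional embeddings. A minimal sketch of that shape arithmetic with illustrative sizes (the 128x128 latent and the text length of 224 are assumptions for the example, not values read from a model config):

```python
import torch

batch_size, in_channels, patch_size = 2, 16, 2
latent_h = latent_w = 128  # e.g. a 1024x1024 image with an 8x VAE downscale
text_len = 224             # illustrative text sequence length

latents = torch.randn(batch_size, in_channels, latent_h, latent_w)

# Same reshape/permute sequence as the forward pass above:
# (B, C, H, W) -> (B, (H/p) * (W/p), C * p * p)
h, w = latent_h // patch_size, latent_w // patch_size
patches = latents.view(batch_size, in_channels, h, patch_size, w, patch_size)
patches = patches.permute(0, 2, 4, 1, 3, 5).contiguous()
patches = patches.view(batch_size, h * w, in_channels * patch_size**2)

print(patches.shape)     # torch.Size([2, 4096, 64]) -> 4096 image tokens of dim 64
print(text_len + h * w)  # 4320 tokens seen by the transformer after text is prepended
```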

src/diffusers/models/transformers/transformer_cogview3plus.py

Lines changed: 38 additions & 12 deletions
@@ -29,7 +29,7 @@
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous
 from ...utils import is_torch_version, logging
-from ..embeddings import CogView3CombinedTimestepConditionEmbeddings, CogView3PlusPatchEmbed
+from ..embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..normalization import CogView3PlusAdaLayerNormZeroTextImage

@@ -133,12 +133,27 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin):
             The number of channels in each head.
         num_attention_heads (`int`, defaults to `64`):
             The number of heads to use for multi-head attention.
-        out_channels (`int`, *optional*, defaults to `16`):
+        out_channels (`int`, defaults to `16`):
             The number of channels in the output.
         text_embed_dim (`int`, defaults to `4096`):
             Input dimension of text embeddings from the text encoder.
         time_embed_dim (`int`, defaults to `512`):
             Output dimension of timestep embeddings.
+        condition_dim (`int`, defaults to `256`):
+            The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, crop_coords).
+        pooled_projection_dim (`int`, defaults to `1536`):
+            The overall pooled dimension by concatenating SDXL-style resolution conditions. As 3 additional conditions are
+            used (original_size, target_size, crop_coords), and each is a sinusoidal condition of dimension `2 * condition_dim`,
+            we get the pooled projection dimension as `2 * condition_dim * 3 => 1536`. The timestep embeddings will be projected
+            to this dimension as well.
+            TODO(yiyi): Do we need this parameter based on the above explanation?
+        pos_embed_max_size (`int`, defaults to `128`):
+            The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added to input
+            patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 means that the maximum
+            supported height and width for image generation is `128 * vae_scale_factor * patch_size => 128 * 8 * 2 => 2048`.
+        sample_size (`int`, defaults to `128`):
+            The base resolution of input latents. If height/width is not provided during generation, this value is used to determine
+            the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
     """

     _supports_gradient_checkpointing = True
@@ -163,15 +178,15 @@ def __init__(
         self.out_channels = out_channels
         self.inner_dim = num_attention_heads * attention_head_dim

-        self.pos_embed = CogView3PlusPatchEmbed(
+        self.patch_embed = CogView3PlusPatchEmbed(
             in_channels=in_channels,
             hidden_size=self.inner_dim,
             patch_size=patch_size,
             text_hidden_size=text_embed_dim,
             pos_embed_max_size=pos_embed_max_size,
         )

-        self.time_condition_embed = CogView3CombinedTimestepConditionEmbeddings(
+        self.time_condition_embed = CogView3CombinedTimestepSizeEmbeddings(
             embedding_dim=time_embed_dim,
             condition_dim=condition_dim,
             pooled_projection_dim=pooled_projection_dim,
@@ -318,20 +333,31 @@ def forward(
         The [`CogView3PlusTransformer2DModel`] forward method.

         Args:
-            hidden_states (`torch.Tensor`): Input `hidden_states`.
-            timestep (`torch.LongTensor`): Indicates denoising step.
-            y (`torch.LongTensor`, *optional*): Label input, used to obtain label embeddings.
-            block_controlnet_hidden_states: (`list` of `torch.Tensor`): A list of tensors for residuals.
-            joint_attention_kwargs (`dict`, *optional*): Additional kwargs for the attention processor.
-            return_dict (`bool`, *optional*, defaults to `True`): Whether to return a `Transformer2DModelOutput`.
+            hidden_states (`torch.Tensor`):
+                Input `hidden_states` of shape `(batch size, channel, height, width)`.
+            encoder_hidden_states (`torch.Tensor`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts)
+                of shape `(batch_size, sequence_len, text_embed_dim)`
+            timestep (`torch.LongTensor`):
+                Used to indicate denoising step.
+            original_size (`torch.Tensor`):
+                CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`torch.Tensor`):
+                CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crop_coords (`torch.Tensor`):
+                CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.

         Returns:
-            Output tensor or `Transformer2DModelOutput`.
+            `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]:
+                The denoised latents using provided inputs as conditioning.
         """
         height, width = hidden_states.shape[-2:]
         text_seq_length = encoder_hidden_states.shape[1]

-        hidden_states = self.pos_embed(
+        hidden_states = self.patch_embed(
             hidden_states, encoder_hidden_states
         ) # takes care of adding positional embeddings too.
         emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
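
The resolution and conditioning numbers quoted in the new docstring can be checked with a few lines of arithmetic (a restatement of the formulas above, not code from this diff):

```python
# Each of the three SDXL-style conditions (original_size, target_size, crop_coords)
# is an (h, w) pair, and each value is embedded sinusoidally with `condition_dim` channels.
condition_dim = 256
pooled_projection_dim = 2 * condition_dim * 3
print(pooled_projection_dim)  # 1536

pos_embed_max_size, vae_scale_factor, patch_size, sample_size = 128, 8, 2, 128
print(pos_embed_max_size * vae_scale_factor * patch_size)  # 2048 -> maximum supported height/width
print(sample_size * vae_scale_factor)                      # 1024 -> default generation resolution
```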

src/diffusers/pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -146,7 +146,7 @@
         "CogVideoXVideoToVideoPipeline",
     ]
     _import_structure["cogview3"] = [
-        "CogView3PlusPipeline",
+        "CogView3PlusPipeline"
     ]
     _import_structure["controlnet"].extend(
         [
