Commit de925be

update

1 parent 2dda910 commit de925be

4 files changed: 161 additions, 38 deletions


scripts/convert_cosmos_to_diffusers.py

Lines changed: 119 additions & 27 deletions
@@ -1,11 +1,13 @@
 import argparse
+import pathlib
 from typing import Any, Dict
 
 import torch
 from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
 from transformers import T5EncoderModel, T5TokenizerFast
 
-from diffusers import CosmosTransformer3DModel, EDMEulerScheduler
+from diffusers import AutoencoderKLCosmos, CosmosTransformer3DModel, EDMEulerScheduler
 
 
 def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -63,10 +65,81 @@ def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
 }
 
 VAE_KEYS_RENAME_DICT = {
-    "conv3d": "conv",
+    "down.0": "down_blocks.0",
+    "down.1": "down_blocks.1",
+    "down.2": "down_blocks.2",
+    "up.0": "up_blocks.2",
+    "up.1": "up_blocks.1",
+    "up.2": "up_blocks.0",
+    ".block.": ".resnets.",
+    "downsample": "downsamplers.0",
+    "upsample": "upsamplers.0",
+    "mid.block_1": "mid_block.resnets.0",
+    "mid.attn_1.0": "mid_block.attentions.0",
+    "mid.attn_1.1": "mid_block.temp_attentions.0",
+    "mid.block_2": "mid_block.resnets.1",
+    ".q.conv3d": ".to_q",
+    ".k.conv3d": ".to_k",
+    ".v.conv3d": ".to_v",
+    ".proj_out.conv3d": ".to_out.0",
+    ".0.conv3d": ".conv_s",
+    ".1.conv3d": ".conv_t",
+    "conv1.conv3d": "conv1",
+    "conv2.conv3d": "conv2",
+    "conv3.conv3d": "conv3",
+    "nin_shortcut.conv3d": "conv_shortcut",
+    "quant_conv.conv3d": "quant_conv",
+    "post_quant_conv.conv3d": "post_quant_conv",
 }
 
-VAE_SPECIAL_KEYS_REMAP = {}
+VAE_SPECIAL_KEYS_REMAP = {
+    "wavelets": remove_keys_,
+    "_arange": remove_keys_,
+    "patch_size_buffer": remove_keys_,
+}
+
+VAE_CONFIGS = {
+    "CV8x8x8-0.1": {
+        "name": "nvidia/Cosmos-0.1-Tokenizer-CV8x8x8",
+        "diffusers_config": {
+            "in_channels": 3,
+            "out_channels": 3,
+            "latent_channels": 16,
+            "encoder_block_out_channels": (128, 256, 512, 512),
+            "decode_block_out_channels": (256, 512, 512, 512),
+            "attention_resolutions": (32,),
+            "resolution": 1024,
+            "num_layers": 2,
+            "patch_size": 4,
+            "patch_type": "haar",
+            "scaling_factor": 1.0,
+            "spatial_compression_ratio": 8,
+            "temporal_compression_ratio": 8,
+            "latents_mean": None,
+            "latents_std": None,
+        },
+    },
+    "CV8x8x8-1.0": {
+        "name": "nvidia/Cosmos-1.0-Tokenizer-CV8x8x8",
+        "diffusers_config": {
+            "in_channels": 3,
+            "out_channels": 3,
+            "latent_channels": 16,
+            "encoder_block_out_channels": (128, 256, 512, 512),
+            "decode_block_out_channels": (256, 512, 512, 512),
+            "attention_resolutions": (32,),
+            "resolution": 1024,
+            "num_layers": 2,
+            "patch_size": 4,
+            "patch_type": "haar",
+            "scaling_factor": 1.0,
+            "spatial_compression_ratio": 8,
+            "temporal_compression_ratio": 8,
+            "latents_mean": None,
+            "latents_std": None,
+        },
+    },
+}
 
 
 def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
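
As a quick illustration (not part of the commit), the rename table is applied by plain substring replacement, so a TorchScript checkpoint key maps onto the diffusers layout as sketched below; the sample key is hypothetical:

    # Hypothetical original key from the TorchScript checkpoint.
    key = "down.0.block.1.conv1.conv3d.weight"
    for old, new in VAE_KEYS_RENAME_DICT.items():
        key = key.replace(old, new)
    # Result: "down_blocks.0.resnets.1.conv1.weight"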
@@ -105,36 +178,53 @@ def convert_transformer(ckpt_path: str):
     return transformer
 
 
-# def convert_vae(ckpt_path: str):
-#     original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
+def convert_vae(vae_type: str):
+    model_name = VAE_CONFIGS[vae_type]["name"]
+    snapshot_directory = snapshot_download(model_name, repo_type="model")
+    directory = pathlib.Path(snapshot_directory)
 
-#     with init_empty_weights():
-#         vae = AutoencoderKLHunyuanVideo()
+    autoencoder_file = directory / "autoencoder.jit"
+    mean_std_file = directory / "mean_std.pt"
 
-#     for key in list(original_state_dict.keys()):
-#         new_key = key[:]
-#         for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
-#             new_key = new_key.replace(replace_key, rename_key)
-#         update_state_dict_(original_state_dict, key, new_key)
+    original_state_dict = torch.jit.load(autoencoder_file.as_posix()).state_dict()
+    if mean_std_file.exists():
+        mean_std = torch.load(mean_std_file, map_location="cpu", weights_only=True)
+    else:
+        mean_std = (None, None)
 
-#     for key in list(original_state_dict.keys()):
-#         for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
-#             if special_key not in key:
-#                 continue
-#             handler_fn_inplace(key, original_state_dict)
+    config = VAE_CONFIGS[vae_type]["diffusers_config"]
+    config.update(
+        {
+            "latents_mean": mean_std[0],
+            "latents_std": mean_std[1],
+        }
+    )
+    vae = AutoencoderKLCosmos(**config)
 
-#     vae.load_state_dict(original_state_dict, strict=True, assign=True)
-#     return vae
+    for key in list(original_state_dict.keys()):
+        new_key = key[:]
+        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
+            new_key = new_key.replace(replace_key, rename_key)
+        update_state_dict_(original_state_dict, key, new_key)
+
+    for key in list(original_state_dict.keys()):
+        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, original_state_dict)
+
+    vae.load_state_dict(original_state_dict, strict=True, assign=True)
+    return vae
 
 
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
     )
-    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original VAE checkpoint")
-    parser.add_argument("--text_encoder_path", type=str, default=None, help="Path to original T5 checkpoint")
-    parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to original T5 tokenizer")
+    parser.add_argument("--vae_type", type=str, default=None, choices=list(VAE_CONFIGS.keys()), help="Type of VAE")
+    parser.add_argument("--text_encoder_path", type=str, default=None, help="Path or HF id to original T5 checkpoint")
+    parser.add_argument("--tokenizer_path", type=str, default=None, help="Path or HF id to original T5 tokenizer")
     parser.add_argument("--save_pipeline", action="store_true")
     parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
     parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
@@ -155,7 +245,8 @@ def get_args():
     dtype = DTYPE_MAPPING[args.dtype]
 
     if args.save_pipeline:
-        assert args.transformer_ckpt_path is not None and args.vae_ckpt_path is not None
+        assert args.transformer_ckpt_path is not None
+        assert args.vae_type is not None
         assert args.text_encoder_path is not None
         assert args.tokenizer_path is not None
         assert args.text_encoder_2_path is not None
@@ -166,10 +257,10 @@ def get_args():
     if not args.save_pipeline:
         transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
 
-    # if args.vae_ckpt_path is not None:
-    #     vae = convert_vae(args.vae_ckpt_path)
-    #     if not args.save_pipeline:
-    #         vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+    if args.vae_type is not None:
+        vae = convert_vae(args.vae_type)
+        if not args.save_pipeline:
+            vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
 
     if args.save_pipeline:
         text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_path, torch_dtype=dtype)
@@ -184,6 +275,7 @@ def get_args():
         num_train_timesteps=1000,
         prediction_type="epsilon",
         rho=7.0,
+        final_sigmas_type="sigma_min",
     )
 
     # if args.save_pipeline:
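
For context, final_sigmas_type controls the last value appended to the sigma schedule. A minimal inspection sketch, assuming the flag behaves as in other diffusers schedulers ("sigma_min" ends the schedule at the smallest sigma instead of zero):

    from diffusers import EDMEulerScheduler

    scheduler = EDMEulerScheduler(
        num_train_timesteps=1000,
        prediction_type="epsilon",
        rho=7.0,
        final_sigmas_type="sigma_min",
    )
    scheduler.set_timesteps(num_inference_steps=35)
    # Expected: the trailing sigma is sigma_min rather than 0.0.
    print(scheduler.sigmas[-1])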

src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py

Lines changed: 28 additions & 0 deletions
@@ -853,6 +853,34 @@ class AutoencoderKLCosmos(ModelMixin, ConfigMixin):
             Number of output channels.
         latent_channels (`int`, defaults to `16`):
             Number of latent channels.
+        encoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
+            Number of output channels for each encoder down block.
+        decode_block_out_channels (`Tuple[int, ...]`, defaults to `(256, 512, 512, 512)`):
+            Number of output channels for each decoder up block.
+        attention_resolutions (`Tuple[int, ...]`, defaults to `(32,)`):
+            List of image/video resolutions at which to apply attention.
+        resolution (`int`, defaults to `1024`):
+            Base image/video resolution used for computing whether a block should have attention layers.
+        num_layers (`int`, defaults to `2`):
+            Number of resnet blocks in each encoder/decoder block.
+        patch_size (`int`, defaults to `4`):
+            Patch size used for patching the input image/video.
+        patch_type (`str`, defaults to `haar`):
+            Patch type used for patching the input image/video. Can be either `haar` or `rearrange`.
+        scaling_factor (`float`, defaults to `1.0`):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. Not applicable in Cosmos,
+            but we default to 1.0 for consistency.
+        spatial_compression_ratio (`int`, defaults to `8`):
+            The spatial compression ratio to apply in the VAE. The number of downsample blocks is determined using
+            this.
+        temporal_compression_ratio (`int`, defaults to `8`):
+            The temporal compression ratio to apply in the VAE. The number of downsample blocks is determined using
+            this.
     """
 
     _supports_gradient_checkpointing = True
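
A hypothetical instantiation using the CV8x8x8 configuration from the conversion script above (a sketch, not an official checkpoint-loading path):

    from diffusers import AutoencoderKLCosmos

    # Mirrors VAE_CONFIGS["CV8x8x8-1.0"]["diffusers_config"]; the conversion
    # script fills latents_mean/latents_std from mean_std.pt when available.
    vae = AutoencoderKLCosmos(
        in_channels=3,
        out_channels=3,
        latent_channels=16,
        encoder_block_out_channels=(128, 256, 512, 512),
        decode_block_out_channels=(256, 512, 512, 512),
        attention_resolutions=(32,),
        resolution=1024,
        num_layers=2,
        patch_size=4,
        patch_type="haar",
        scaling_factor=1.0,
        spatial_compression_ratio=8,
        temporal_compression_ratio=8,
    )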

src/diffusers/pipelines/cosmos/pipeline_cosmos.py

Lines changed: 14 additions & 10 deletions
@@ -595,16 +595,20 @@ def __call__(
         self._current_timestep = None
 
         if not output_type == "latent":
-            latents_mean, latents_std = self.vae.config.latents_mean, self.vae.config.latents_std
-            latents_mean = torch.tensor(latents_mean).view(1, self.vae.config.latent_channels, -1, 1, 1)[
-                :, :, : latents.size(2)
-            ]
-            latents_std = torch.tensor(latents_std).view(1, self.vae.config.latent_channels, -1, 1, 1)[
-                :, :, : latents.size(2)
-            ]
-            latents = (
-                latents * self.vae.config.latent_std / self.scheduler.config.sigma_data + self.vae.config.latent_mean
-            )
+            if self.vae.config.latents_mean is not None:
+                latents_mean, latents_std = self.vae.config.latents_mean, self.vae.config.latents_std
+                latents_mean = torch.tensor(latents_mean).view(1, self.vae.config.latent_channels, -1, 1, 1)[
+                    :, :, : latents.size(2)
+                ]
+                latents_std = torch.tensor(latents_std).view(1, self.vae.config.latent_channels, -1, 1, 1)[
+                    :, :, : latents.size(2)
+                ]
+                latents = (
+                    latents * latents_std / self.scheduler.config.sigma_data
+                    + latents_mean
+                )
+            else:
+                latents = latents / self.scheduler.config.sigma_data
             video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
             video = self.video_processor.postprocess_video(video, output_type=output_type)
         else:
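
In isolation, the denormalization this hunk guards looks like the following sketch (tensors and sigma_data are hypothetical stand-ins for the pipeline's real values):

    import torch

    sigma_data = 0.5                             # illustrative; read from the scheduler config in the pipeline
    latents = torch.randn(1, 16, 8, 4, 4)        # (batch, channels, frames, height, width)
    latents_mean = torch.zeros(1, 16, 1, 1, 1)   # per-channel stats, when the VAE config provides them
    latents_std = torch.ones(1, 16, 1, 1, 1)

    # With stats: undo per-channel normalization, then rescale by sigma_data.
    denormalized = latents * latents_std / sigma_data + latents_mean
    # Without stats (latents_mean is None): only the sigma_data rescaling applies.
    fallback = latents / sigma_data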

tests/models/autoencoders/test_models_autoencoder_cosmos.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
