add pipeline

a-r-r-o-w · a-r-r-o-w · commit a63e543a2411 · 2025-02-03T10:11:27.000+01:00
diff --git a/scripts/convert_cosmos_to_diffusers.py b/scripts/convert_cosmos_to_diffusers.py
@@ -0,0 +1,188 @@
+import argparse
+from typing import Any, Dict
+
+import torch
+from accelerate import init_empty_weights
+
+from diffusers import CosmosTransformer3DModel
+
+
+def remove_keys_(key: str, state_dict: Dict[str, Any]):
+    """Drop `key` from `state_dict` in place; raises `KeyError` if the key is absent."""
+    del state_dict[key]
+
+
+def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
+    """Rename `old_key` to `new_key` in `state_dict`, in place.
+
+    The value is popped and re-inserted, so the renamed entry moves to the end of the dict
+    (also when `old_key == new_key`). Nothing is returned; raises `KeyError` if `old_key`
+    is not present.
+    """
+    state_dict[new_key] = state_dict.pop(old_key)
+
+
+def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
+    """Move a `blocks.block<N>...` entry to `transformer_blocks.<N>...`, in place.
+
+    The block index is parsed from the second dot-separated component of `key`.
+    """
+    block_component = key.split(".")[1]
+    index = int(block_component.removeprefix("block"))
+
+    # Everything after the "blocks.block<N>" prefix is carried over unchanged.
+    remainder = key.removeprefix(f"blocks.block{index}")
+    state_dict[f"transformer_blocks.{index}{remainder}"] = state_dict.pop(key)
+
+
+# Substring renames applied to every transformer checkpoint key by `convert_transformer`
+# (after a leading "net." is stripped, if present). Each entry is applied with `str.replace`,
+# one after another in insertion order, so an earlier replacement can change what later
+# patterns match -- keep the ordering in mind when adding entries.
+# NOTE(review): "affline_norm" is presumably the spelling used by the original checkpoint;
+# confirm against the checkpoint keys before "correcting" it.
+TRANSFORMER_KEYS_RENAME_DICT = {
+    "t_embedder.1": "time_embed.t_embedder",
+    "affline_norm": "time_embed.norm",
+    ".blocks.0.block.attn": ".attn1",
+    ".blocks.1.block.attn": ".attn2",
+    ".blocks.2.block": ".ff",
+    ".blocks.0.adaLN_modulation.1": ".norm1.linear_1",
+    ".blocks.0.adaLN_modulation.2": ".norm1.linear_2",
+    ".blocks.1.adaLN_modulation.1": ".norm2.linear_1",
+    ".blocks.1.adaLN_modulation.2": ".norm2.linear_2",
+    ".blocks.2.adaLN_modulation.1": ".norm3.linear_1",
+    ".blocks.2.adaLN_modulation.2": ".norm3.linear_2",
+    "to_q.0": "to_q",
+    "to_q.1": "norm_q",
+    "to_k.0": "to_k",
+    "to_k.1": "norm_k",
+    "to_v.0": "to_v",
+    "layer1": "net.0.proj",
+    "layer2": "net.2",
+    "proj.1": "proj",
+    "x_embedder": "patch_embed",
+    "extra_pos_embedder": "learnable_pos_embed",
+    "final_layer.adaLN_modulation.1": "norm_out.linear_1",
+    "final_layer.adaLN_modulation.2": "norm_out.linear_2",
+    "final_layer.linear": "proj_out",
+}
+
+# Handlers run by `convert_transformer` after the TRANSFORMER_KEYS_RENAME_DICT pass. For every
+# state-dict key that contains a marker substring, the mapped function is called as
+# `handler(key, state_dict)` and mutates the state dict in place: "blocks.block" entries are
+# renamed to "transformer_blocks.<index>", the remaining markers drop the matching entries.
+TRANSFORMER_SPECIAL_KEYS_REMAP = {
+    "blocks.block": rename_transformer_blocks_,
+    "logvar.0.freqs": remove_keys_,
+    "logvar.0.phases": remove_keys_,
+    "logvar.1.weight": remove_keys_,
+    "pos_embedder.seq": remove_keys_,
+}
+
+# Empty placeholders for the VAE conversion; the only code in this script that reads them
+# (`convert_vae`) is currently commented out.
+VAE_KEYS_RENAME_DICT = {}
+
+VAE_SPECIAL_KEYS_REMAP = {}
+
+
+def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Unwrap a loaded checkpoint down to the actual state dict.
+
+    Checkpoints may nest the weights under "model", "module" and/or "state_dict". The wrappers
+    are peeled off in that order, each check being made against the *current* nesting level
+    (not the top-level dict), so e.g. `{"model": {"state_dict": {...}}}` is fully unwrapped.
+
+    Args:
+        saved_dict: Object returned by `torch.load` for the checkpoint.
+
+    Returns:
+        The innermost dict after removing the known wrapper keys; `saved_dict` itself if none
+        of them is present.
+    """
+    state_dict = saved_dict
+    for wrapper_key in ("model", "module", "state_dict"):
+        if wrapper_key in state_dict:
+            state_dict = state_dict[wrapper_key]
+    return state_dict
+
+
+def convert_transformer(ckpt_path: str):
+    """Load an original transformer checkpoint and return it as a `CosmosTransformer3DModel`.
+
+    The checkpoint is loaded on CPU, its keys are renamed in two passes (substring renames from
+    `TRANSFORMER_KEYS_RENAME_DICT`, then the in-place handlers from
+    `TRANSFORMER_SPECIAL_KEYS_REMAP`), and the result is loaded strictly into a model that was
+    instantiated under `init_empty_weights()`.
+    """
+    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+    original_state_dict = get_state_dict(checkpoint)
+
+    with init_empty_weights():
+        transformer = CosmosTransformer3DModel()
+
+    # Pass 1: strip the leading "net." (when present) and apply the substring renames in order.
+    prefix = "net."
+    for old_key in list(original_state_dict.keys()):
+        renamed = old_key.removeprefix(prefix)
+        for pattern, replacement in TRANSFORMER_KEYS_RENAME_DICT.items():
+            renamed = renamed.replace(pattern, replacement)
+        update_state_dict_(original_state_dict, old_key, renamed)
+
+    # Pass 2: iterate over a snapshot of the keys, because the handlers pop/rename entries.
+    for current_key in list(original_state_dict.keys()):
+        for marker, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
+            if marker in current_key:
+                handler_fn_inplace(current_key, original_state_dict)
+
+    # strict=True surfaces any missing/unexpected key; assign=True lets the loaded tensors
+    # replace the placeholder parameters created under init_empty_weights().
+    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
+    return transformer
+
+
+# def convert_vae(ckpt_path: str):
+#     original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
+
+#     with init_empty_weights():
+#         vae = AutoencoderKLHunyuanVideo()
+
+#     for key in list(original_state_dict.keys()):
+#         new_key = key[:]
+#         for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
+#             new_key = new_key.replace(replace_key, rename_key)
+#         update_state_dict_(original_state_dict, key, new_key)
+
+#     for key in list(original_state_dict.keys()):
+#         for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
+#             if special_key not in key:
+#                 continue
+#             handler_fn_inplace(key, original_state_dict)
+
+#     vae.load_state_dict(original_state_dict, strict=True, assign=True)
+#     return vae
+
+
+def get_args():
+    """Parse the command-line arguments of the conversion script.
+
+    Returns:
+        The `argparse.Namespace` with the checkpoint paths, `save_pipeline`, `output_path`
+        (required) and `dtype` (one of the `DTYPE_MAPPING` keys, default "bf16").
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
+    )
+    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original VAE checkpoint")
+    parser.add_argument("--text_encoder_path", type=str, default=None, help="Path to original llama checkpoint")
+    parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to original llama tokenizer")
+    parser.add_argument("--text_encoder_2_path", type=str, default=None, help="Path to original clip checkpoint")
+    parser.add_argument("--save_pipeline", action="store_true")
+    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
+    # Restrict to the supported names so a typo is rejected by argparse with a usage message
+    # instead of failing later with a bare KeyError on the DTYPE_MAPPING lookup.
+    parser.add_argument(
+        "--dtype", default="bf16", choices=tuple(DTYPE_MAPPING), help="Torch dtype to save the transformer in."
+    )
+    return parser.parse_args()
+
+
+# Maps the `--dtype` CLI string to the torch dtype the converted transformer is cast to
+# before it is saved.
+DTYPE_MAPPING = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    transformer = None
+    dtype = DTYPE_MAPPING[args.dtype]
+
+    if args.save_pipeline:
+        # Validate with an explicit raise rather than `assert`: assertions are stripped when
+        # Python runs with -O, which would silently skip these checks.
+        required_for_pipeline = (
+            "transformer_ckpt_path",
+            "vae_ckpt_path",
+            "text_encoder_path",
+            "tokenizer_path",
+            "text_encoder_2_path",
+        )
+        missing = [f"--{name}" for name in required_for_pipeline if getattr(args, name) is None]
+        if missing:
+            raise ValueError(f"--save_pipeline requires the following arguments: {', '.join(missing)}")
+
+    if args.transformer_ckpt_path is not None:
+        transformer = convert_transformer(args.transformer_ckpt_path)
+        transformer = transformer.to(dtype=dtype)
+        # With --save_pipeline the transformer is kept in memory for the pipeline instead of
+        # being written on its own.
+        if not args.save_pipeline:
+            transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+
+    # if args.vae_ckpt_path is not None:
+    #     vae = convert_vae(args.vae_ckpt_path)
+    #     if not args.save_pipeline:
+    #         vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+
+    # if args.save_pipeline:
+    #     text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
+    #     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
+    #     text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
+    #     tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
+    #     scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+    #     pipe = CosmosPipeline(
+    #         transformer=transformer,
+    #         vae=vae,
+    #         text_encoder=text_encoder,
+    #         tokenizer=tokenizer,
+    #         text_encoder_2=text_encoder_2,
+    #         tokenizer_2=tokenizer_2,
+    #         scheduler=scheduler,
+    #     )
+    #     pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -287,6 +287,7 @@
             "CogVideoXVideoToVideoPipeline",
             "CogView3PlusPipeline",
             "ConsisIDPipeline",
+            "CosmosPipeline",
             "CycleDiffusionPipeline",
             "FluxControlImg2ImgPipeline",
             "FluxControlInpaintPipeline",
@@ -781,6 +782,7 @@
             CogVideoXVideoToVideoPipeline,
             CogView3PlusPipeline,
             ConsisIDPipeline,
+            CosmosPipeline,
             CycleDiffusionPipeline,
             FluxControlImg2ImgPipeline,
             FluxControlInpaintPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
@@ -155,6 +155,7 @@
     ]
     _import_structure["cogview3"] = ["CogView3PlusPipeline"]
     _import_structure["consisid"] = ["ConsisIDPipeline"]
+    _import_structure["cosmos"] = ["CosmosPipeline"]
     _import_structure["controlnet"].extend(
         [
             "BlipDiffusionControlNetPipeline",
@@ -518,6 +519,7 @@
             StableDiffusionControlNetXSPipeline,
             StableDiffusionXLControlNetXSPipeline,
         )
+        from .cosmos import CosmosPipeline
         from .deepfloyd_if import (
             IFImg2ImgPipeline,
             IFImg2ImgSuperResolutionPipeline,
diff --git a/src/diffusers/pipelines/cosmos/__init__.py b/src/diffusers/pipelines/cosmos/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+# Filled below: `_dummy_objects` receives the objects collected from the dummy module when
+# torch/transformers are unavailable; `_import_structure` maps a submodule name to the names
+# it exports and is handed to `_LazyModule`.
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    # At least one of the two dependencies is missing: register the dummy module's objects
+    # instead of the real pipeline.
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_cosmos"] = ["CosmosPipeline"]
+
+# Under type checking or DIFFUSERS_SLOW_IMPORT the names are imported directly; otherwise this
+# module is replaced in `sys.modules` by a `_LazyModule` built from `_import_structure`.
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_cosmos import CosmosPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    # Attach the collected dummy objects (if any) to the module object installed above.
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos.py
diff --git a/src/diffusers/pipelines/cosmos/pipeline_output.py b/src/diffusers/pipelines/cosmos/pipeline_output.py

Original file line number	Diff line number	Diff line change
`@@ -155,6 +155,7 @@`
`155`	`155`	`]`
`156`	`156`	`_import_structure["cogview3"] = ["CogView3PlusPipeline"]`
`157`	`157`	`_import_structure["consisid"] = ["ConsisIDPipeline"]`
	`158`	`+ _import_structure["cosmos"] = ["CosmosPipeline"]`
`158`	`159`	`_import_structure["controlnet"].extend(`
`159`	`160`	`[`
`160`	`161`	`"BlipDiffusionControlNetPipeline",`
`@@ -518,6 +519,7 @@`
`518`	`519`	`StableDiffusionControlNetXSPipeline,`
`519`	`520`	`StableDiffusionXLControlNetXSPipeline,`
`520`	`521`	`)`
	`522`	`+ from .cosmos import CosmosPipeline`
`521`	`523`	`from .deepfloyd_if import (`
`522`	`524`	`IFImg2ImgPipeline,`
`523`	`525`	`IFImg2ImgSuperResolutionPipeline,`