 from videosys.core.pab_mgr import PABConfig, set_pab_manager
 from videosys.core.pipeline import VideoSysPipeline, VideoSysPipelineOutput
 from videosys.models.autoencoders.autoencoder_kl_open_sora import OpenSoraVAE_V1_2
-from videosys.models.transformers.open_sora_transformer_3d import STDiT3_XL_2
+from videosys.models.transformers.open_sora_transformer_3d import STDiT3
 from videosys.schedulers.scheduling_rflow_open_sora import RFLOW
 from videosys.utils.utils import save_video
 
@@ -175,10 +175,10 @@ class OpenSoraPipeline(VideoSysPipeline):
         tokenizer (`T5Tokenizer`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        transformer ([`Transformer2DModel`]):
-            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
+        transformer ([`STDiT3`]):
+            A text conditioned `STDiT3` to denoise the encoded video latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
     """
     bad_punct_regex = re.compile(
         r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
@@ -193,7 +193,7 @@ def __init__(
         text_encoder: Optional[T5EncoderModel] = None,
         tokenizer: Optional[AutoTokenizer] = None,
         vae: Optional[AutoencoderKL] = None,
-        transformer: Optional[STDiT3_XL_2] = None,
+        transformer: Optional[STDiT3] = None,
         scheduler: Optional[RFLOW] = None,
         device: torch.device = torch.device("cuda"),
         dtype: torch.dtype = torch.bfloat16,
@@ -215,14 +215,9 @@ def __init__(
                 micro_batch_size=config.tiling_size,
             ).to(dtype)
         if transformer is None:
-            transformer = STDiT3_XL_2(
-                from_pretrained=config.transformer,
-                qk_norm=True,
-                enable_flash_attn=config.enable_flash_attn,
-                in_channels=vae.out_channels,
-                caption_channels=text_encoder.config.d_model,
-                model_max_length=300,
-            ).to(device, dtype)
+            transformer = STDiT3.from_pretrained(config.transformer, enable_flash_attn=config.enable_flash_attn).to(
+                dtype
+            )
         if scheduler is None:
             scheduler = RFLOW(
                 use_timestep_transform=True, num_sampling_steps=config.num_sampling_steps, cfg_scale=config.cfg_scale
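For reference, a minimal sketch of the new loading path introduced in the last hunk. The checkpoint id and the `enable_flash_attn` value below are placeholder assumptions standing in for `config.transformer` and `config.enable_flash_attn`; only the `STDiT3.from_pretrained(...)` call and the dtype cast are taken from the diff itself.

```python
import torch

from videosys.models.transformers.open_sora_transformer_3d import STDiT3

# Assumed checkpoint id; in the pipeline this value comes from config.transformer.
checkpoint = "hpcai-tech/OpenSora-STDiT-v3"

# Mirrors the updated initialization: weights and architecture come from
# from_pretrained() instead of an explicit STDiT3_XL_2(...) constructor call,
# and the module is then cast to bfloat16 as in the pipeline default dtype.
transformer = STDiT3.from_pretrained(checkpoint, enable_flash_attn=False).to(torch.bfloat16)
```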