hao-ai-lab
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
Lines changed: 3 additions & 3 deletions
diff --git a/examples/inference/basic/basic.py b/examples/inference/basic/basic.py
Lines changed: 1 addition & 1 deletion
diff --git a/fastvideo/v1/configs/models/base.py b/fastvideo/v1/configs/models/base.py
Lines changed: 2 additions & 4 deletions
diff --git a/fastvideo/v1/configs/models/dits/stepvideo.py b/fastvideo/v1/configs/models/dits/stepvideo.py
Lines changed: 5 additions & 3 deletions
diff --git a/fastvideo/v1/configs/models/encoders/base.py b/fastvideo/v1/configs/models/encoders/base.py
Lines changed: 1 addition & 4 deletions
diff --git a/fastvideo/v1/configs/models/encoders/clip.py b/fastvideo/v1/configs/models/encoders/clip.py
Lines changed: 1 addition & 25 deletions
diff --git a/fastvideo/v1/configs/models/encoders/llama.py b/fastvideo/v1/configs/models/encoders/llama.py
Lines changed: 1 addition & 25 deletions
diff --git a/fastvideo/v1/configs/models/encoders/t5.py b/fastvideo/v1/configs/models/encoders/t5.py
Lines changed: 1 addition & 23 deletions
diff --git a/fastvideo/v1/dataset/benchmarks/benchmark_parquet_dataset_iterable_style.py b/fastvideo/v1/dataset/benchmarks/benchmark_parquet_dataset_iterable_style.py
Lines changed: 3 additions & 3 deletions
diff --git a/fastvideo/v1/dataset/benchmarks/benchmark_parquet_dataset_map_style.py b/fastvideo/v1/dataset/benchmarks/benchmark_parquet_dataset_map_style.py
Lines changed: 3 additions & 3 deletions
@@ -122,17 +122,17 @@ jobs:
             # Actual tests
             encoder-test:
               - 'fastvideo/v1/models/encoders/**'
-              - 'fastvideo/v1/models/loader/**'
+              - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/encoders/**'
               - *common-paths
             vae-test:
               - 'fastvideo/v1/models/vaes/**'
-              - 'fastvideo/v1/models/loader/**'
+              - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/vaes/**'
               - *common-paths
             transformer-test:
               - 'fastvideo/v1/models/dits/**'
-              - 'fastvideo/v1/models/loader/**'
+              - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/transformers/**'
               - 'fastvideo/v1/layers/**'
               - 'fastvideo/v1/attention/**'
 
@@ -10,7 +10,7 @@ def main():
     # attempt to identify the optimal arguments.
     generator = VideoGenerator.from_pretrained(
         "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
-        # FastVideo will automatically handle distributed setup
+        # if num_gpus > 1, FastVideo will automatically handle distributed setup
         num_gpus=2,
         use_fsdp_inference=True,
         use_cpu_offload=False
 
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass, field, fields
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict
 
 from fastvideo.v1.logger import init_logger
 
@@ -12,9 +12,7 @@
 # 3. Any field in ArchConfig is fixed upon initialization, and should be hidden away from users
 @dataclass
 class ArchConfig:
-    stacked_params_mapping: List[Tuple[str, str, str]] = field(
-        default_factory=list
-    )  # mapping from huggingface weight names to custom names
+    pass
 
 
 @dataclass
 
@@ -5,11 +5,13 @@
 from fastvideo.v1.configs.models.dits.base import DiTArchConfig, DiTConfig
 
 
+def is_blocks(n: str, m) -> bool:  # used as an _fsdp_shard_conditions entry; n is presumably the module name (confirm against caller), m is unused
+    return "blocks" in n and str.isdigit(n.split(".")[-1])  # True when n contains "blocks" and its last dot-separated part is all digits
+
+
 @dataclass
 class StepVideoArchConfig(DiTArchConfig):
-    _fsdp_shard_conditions: list = field(
-        default_factory=lambda:
-        [lambda n, m: "transformer_blocks" in n and n.split(".")[-1].isdigit()])
+    _fsdp_shard_conditions: list = field(default_factory=lambda: [is_blocks])
 
     _param_names_mapping: dict = field(
         default_factory=lambda: {
 
@@ -32,11 +32,8 @@ class TextEncoderArchConfig(EncoderArchConfig):
     output_past: bool = True
     scalable_attention: bool = True
     tie_word_embeddings: bool = False
-    stacked_params_mapping: List[Tuple[str, str, str]] = field(
-        default_factory=list
-    )  # mapping from huggingface weight names to custom names
+
     tokenizer_kwargs: Dict[str, Any] = field(default_factory=dict)
-    _fsdp_shard_conditions: list = field(default_factory=lambda: [])
 
     def __post_init__(self) -> None:
         self.tokenizer_kwargs = {
 
@@ -1,21 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Optional
 
 from fastvideo.v1.configs.models.encoders.base import (ImageEncoderArchConfig,
                                                        ImageEncoderConfig,
                                                        TextEncoderArchConfig,
                                                        TextEncoderConfig)
 
 
-def _is_transformer_layer(n: str, m) -> bool:
-    return "layers" in n and str.isdigit(n.split(".")[-1])
-
-
-def _is_embeddings(n: str, m) -> bool:
-    return n.endswith("embeddings")
-
-
 @dataclass
 class CLIPTextArchConfig(TextEncoderArchConfig):
     vocab_size: int = 49408
@@ -35,15 +27,6 @@ class CLIPTextArchConfig(TextEncoderArchConfig):
     bos_token_id: int = 49406
     eos_token_id: int = 49407
     text_len: int = 77
-    stacked_params_mapping: List[Tuple[str, str,
-                                       str]] = field(default_factory=lambda: [
-                                           # (param_name, shard_name, shard_id)
-                                           ("qkv_proj", "q_proj", "q"),
-                                           ("qkv_proj", "k_proj", "k"),
-                                           ("qkv_proj", "v_proj", "v"),
-                                       ])
-    _fsdp_shard_conditions: list = field(
-        default_factory=lambda: [_is_transformer_layer, _is_embeddings])
 
 
 @dataclass
@@ -62,13 +45,6 @@ class CLIPVisionArchConfig(ImageEncoderArchConfig):
     attention_dropout: float = 0.0
     initializer_range: float = 0.02
     initializer_factor: float = 1.0
-    stacked_params_mapping: List[Tuple[str, str,
-                                       str]] = field(default_factory=lambda: [
-                                           # (param_name, shard_name, shard_id)
-                                           ("qkv_proj", "q_proj", "q"),
-                                           ("qkv_proj", "k_proj", "k"),
-                                           ("qkv_proj", "v_proj", "v"),
-                                       ])
 
 
 @dataclass
 
@@ -1,23 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Optional
 
 from fastvideo.v1.configs.models.encoders.base import (TextEncoderArchConfig,
                                                        TextEncoderConfig)
 
 
-def _is_transformer_layer(n: str, m) -> bool:
-    return "layers" in n and str.isdigit(n.split(".")[-1])
-
-
-def _is_embeddings(n: str, m) -> bool:
-    return n.endswith("embed_tokens")
-
-
-def _is_final_norm(n: str, m) -> bool:
-    return n.endswith("norm")
-
-
 @dataclass
 class LlamaArchConfig(TextEncoderArchConfig):
     vocab_size: int = 32000
@@ -44,18 +32,6 @@ class LlamaArchConfig(TextEncoderArchConfig):
     head_dim: Optional[int] = None
     hidden_state_skip_layer: int = 2
     text_len: int = 256
-    stacked_params_mapping: List[Tuple[str, str, str]] = field(
-        default_factory=lambda: [
-            # (param_name, shard_name, shard_id)
-            (".qkv_proj", ".q_proj", "q"),
-            (".qkv_proj", ".k_proj", "k"),
-            (".qkv_proj", ".v_proj", "v"),
-            (".gate_up_proj", ".gate_proj", 0),  # type: ignore
-            (".gate_up_proj", ".up_proj", 1),  # type: ignore
-        ])
-    _fsdp_shard_conditions: list = field(
-        default_factory=lambda:
-        [_is_transformer_layer, _is_embeddings, _is_final_norm])
 
 
 @dataclass
 
@@ -1,23 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Optional
 
 from fastvideo.v1.configs.models.encoders.base import (TextEncoderArchConfig,
                                                        TextEncoderConfig)
 
 
-def _is_transformer_layer(n: str, m) -> bool:
-    return "block" in n and str.isdigit(n.split(".")[-1])
-
-
-def _is_embeddings(n: str, m) -> bool:
-    return n.endswith("shared")
-
-
-def _is_final_layernorm(n: str, m) -> bool:
-    return n.endswith("final_layer_norm")
-
-
 @dataclass
 class T5ArchConfig(TextEncoderArchConfig):
     vocab_size: int = 32128
@@ -41,16 +29,6 @@ class T5ArchConfig(TextEncoderArchConfig):
     eos_token_id: int = 1
     classifier_dropout: float = 0.0
     text_len: int = 512
-    stacked_params_mapping: List[Tuple[str, str,
-                                       str]] = field(default_factory=lambda: [
-                                           # (param_name, shard_name, shard_id)
-                                           (".qkv_proj", ".q", "q"),
-                                           (".qkv_proj", ".k", "k"),
-                                           (".qkv_proj", ".v", "v"),
-                                       ])
-    _fsdp_shard_conditions: list = field(
-        default_factory=lambda:
-        [_is_transformer_layer, _is_embeddings, _is_final_layernorm])
 
     # Referenced from https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/configuration_t5.py
     def __post_init__(self):
 
@@ -11,7 +11,7 @@
     build_parquet_iterable_style_dataloader)
 from fastvideo.v1.distributed import get_world_rank
 from fastvideo.v1.distributed.parallel_state import (
-    cleanup_dist_env_and_memory, get_local_torch_device,
+    cleanup_dist_env_and_memory, get_torch_device,
     maybe_init_distributed_environment_and_model_parallel)
 from fastvideo.v1.logger import init_logger
 
@@ -148,8 +148,8 @@ def main() -> None:
                 break
 
             # Move data to device
-            latents = latents.to(get_local_torch_device())
-            embeddings = embeddings.to(get_local_torch_device())
+            latents = latents.to(get_torch_device())
+            embeddings = embeddings.to(get_torch_device())
 
             # Calculate actual batch size
             batch_size = latents.size(0)
 
@@ -13,7 +13,7 @@
     build_parquet_map_style_dataloader)
 from fastvideo.v1.distributed import get_world_rank
 from fastvideo.v1.distributed.parallel_state import (
-    cleanup_dist_env_and_memory, get_local_torch_device,
+    cleanup_dist_env_and_memory, get_torch_device,
     maybe_init_distributed_environment_and_model_parallel)
 from fastvideo.v1.logger import init_logger
 
@@ -165,8 +165,8 @@ def main() -> None:
                 break
 
             # Move data to device
-            latents = latents.to(get_local_torch_device())
-            embeddings = embeddings.to(get_local_torch_device())
+            latents = latents.to(get_torch_device())
+            embeddings = embeddings.to(get_torch_device())
 
             # Calculate actual batch size
             batch_size = latents.size(0)