
Commit 99cbe15

remove dist load related
1 parent 2fa4ecf commit 99cbe15

8 files changed: 50 additions & 171 deletions


fastvideo/v1/distributed/parallel_state.py

Lines changed: 7 additions & 33 deletions
@@ -36,7 +36,6 @@
 
 import torch
 import torch.distributed
-import torch.distributed as dist
 from torch.distributed import Backend, ProcessGroup, ReduceOp
 
 import fastvideo.v1.envs as envs
@@ -693,19 +692,13 @@ def destroy(self) -> None:
 
 
 _WORLD: Optional[GroupCoordinator] = None
-_NODE: Optional[GroupCoordinator] = None
 
 
 def get_world_group() -> GroupCoordinator:
     assert _WORLD is not None, ("world group is not initialized")
     return _WORLD
 
 
-def get_node_group() -> GroupCoordinator:
-    assert _NODE is not None, ("node group is not initialized")
-    return _NODE
-
-
 def init_world_group(ranks: List[int], local_rank: int,
                      backend: str) -> GroupCoordinator:
     return GroupCoordinator(
@@ -717,18 +710,6 @@ def init_world_group(ranks: List[int], local_rank: int,
     )
 
 
-def init_node_group(local_rank: int, backend: str):
-    cpu_group = get_world_group().cpu_group
-    node_ranks = same_node_ranks(cpu_group)
-    node_size = len(node_ranks)
-    all_node_ranks = [
-        list(range(i * node_size, (i + 1) * node_size))
-        for i in range(dist.get_world_size() // node_size)
-    ]
-    global _NODE
-    _NODE = init_model_parallel_group(all_node_ranks, local_rank, backend)
-
-
 def init_model_parallel_group(
     group_ranks: List[List[int]],
     local_rank: int,
@@ -801,8 +782,6 @@ def init_distributed_environment(
     else:
         assert _WORLD.world_size == torch.distributed.get_world_size(), (
             "world group already initialized with a different world size")
-    # Init a group for each node
-    init_node_group(local_rank, backend)
 
 
 _SP: Optional[GroupCoordinator] = None
@@ -925,7 +904,7 @@ def get_dp_rank() -> int:
     return get_dp_group().rank_in_group
 
 
-def get_local_torch_device() -> torch.device:
+def get_torch_device() -> torch.device:
     """Return the torch device for the current rank."""
     return torch.device(f"cuda:{envs.LOCAL_RANK}")
 
@@ -1042,22 +1021,17 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
             "torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
-def same_node_ranks(pg: Union[ProcessGroup, StatelessProcessGroup],
-                    source_rank: int = 0) -> List[int]:
+def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
+                        source_rank: int = 0) -> List[bool]:
     """
-    This is a collective operation that returns ranks that are in the same node
+    This is a collective operation that returns if each rank is in the same node
     as the source rank. It tests if processes are attached to the same
    memory system (shared access to shared memory).
-    Args:
-        pg: the global process group to test
-        source_rank: the rank to test against
-    Returns:
-        A list of ranks that are in the same node as the source rank.
     """
     if isinstance(pg, ProcessGroup):
         assert torch.distributed.get_backend(
             pg) != torch.distributed.Backend.NCCL, (
-                "same_node_ranks should be tested with a non-NCCL group.")
+                "in_the_same_node_as should be tested with a non-NCCL group.")
     # local rank inside the group
     rank = torch.distributed.get_rank(group=pg)
     world_size = torch.distributed.get_world_size(group=pg)
@@ -1129,7 +1103,7 @@ def same_node_ranks(pg: Union[ProcessGroup, StatelessProcessGroup],
             rank_data = pg.broadcast_obj(is_in_the_same_node, src=i)
             aggregated_data += rank_data
 
-    return [i for i, x in enumerate(aggregated_data.tolist()) if x == 1]
+    return [x == 1 for x in aggregated_data.tolist()]
 
 
 def initialize_tensor_parallel_group(
@@ -1258,4 +1232,4 @@ def initialize_sequence_parallel_group(
         backend,
         group_name=group_name)
 
-    return sp_group
+    return sp_group
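Note: besides deleting the node-group machinery, this file changes the helper's contract. same_node_ranks returned the rank indices sharing the source rank's node (List[int]); in_the_same_node_as returns one boolean per rank (List[bool]). A minimal sketch of how a caller could recover the old shape; ranks_in_same_node is a hypothetical adapter, not part of this commit:

    from typing import List

    def ranks_in_same_node(flags: List[bool]) -> List[int]:
        # Hypothetical adapter: rebuild the old same_node_ranks-style
        # result (rank indices) from the new boolean-per-rank list.
        return [rank for rank, same in enumerate(flags) if same]

    # in_the_same_node_as would yield e.g. [True, True, False, False] for a
    # 4-rank group where only ranks 0 and 1 share a node with source_rank 0.
    assert ranks_in_same_node([True, True, False, False]) == [0, 1]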

fastvideo/v1/layers/layernorm.py

Lines changed: 1 addition & 7 deletions
@@ -6,7 +6,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.distributed.tensor import DTensor
 
 from fastvideo.v1.layers.custom_op import CustomOp
 
@@ -77,12 +76,7 @@ def forward_native(
         x = x * torch.rsqrt(variance + self.variance_epsilon)
         x = x.to(orig_dtype)
         if self.has_weight:
-            # TODO(wenxuan): When using CPU offload, FSDP has a bug that doesn't unwrap DTensor in final_layer_norm.
-            # Report this
-            if isinstance(self.weight, DTensor):
-                x = x * self.weight.to_local().to(x.device)
-            else:
-                x = x * self.weight
+            x = x * self.weight
         if residual is None:
             return x
         else:
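Note: with the DTensor branch removed, forward_native assumes self.weight is a plain parameter. A self-contained sketch of the simplified reference path, reconstructed from the context lines above (the real FastVideo RMSNorm also handles a residual input and a weight-less mode):

    import torch
    import torch.nn as nn

    class MiniRMSNorm(nn.Module):
        """Sketch of the post-commit native RMSNorm path."""

        def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
            super().__init__()
            self.variance_epsilon = eps
            self.weight = nn.Parameter(torch.ones(hidden_size))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            orig_dtype = x.dtype
            x = x.float()
            variance = x.pow(2).mean(dim=-1, keepdim=True)
            x = x * torch.rsqrt(variance + self.variance_epsilon)
            x = x.to(orig_dtype)
            # Plain tensor multiply; the isinstance(weight, DTensor)
            # workaround is gone.
            return x * self.weight

    out = MiniRMSNorm(8)(torch.randn(2, 8))  # usage: output shape (2, 8)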

fastvideo/v1/models/dits/wanvideo.py

Lines changed: 4 additions & 4 deletions
@@ -318,9 +318,9 @@ def forward(
         value, _ = self.to_v(norm_hidden_states)
 
         if self.norm_q is not None:
-            query = self.norm_q.forward_native(query)
+            query = self.norm_q(query)
         if self.norm_k is not None:
-            key = self.norm_k.forward_native(key)
+            key = self.norm_k(key)
 
         query = query.squeeze(1).unflatten(2, (self.num_attention_heads, -1))
         key = key.squeeze(1).unflatten(2, (self.num_attention_heads, -1))
@@ -465,9 +465,9 @@ def forward(
         gate_compress, _ = self.to_gate_compress(norm_hidden_states)
 
         if self.norm_q is not None:
-            query = self.norm_q.forward_native(query)
+            query = self.norm_q(query)
         if self.norm_k is not None:
-            key = self.norm_k.forward_native(key)
+            key = self.norm_k(key)
 
         query = query.squeeze(1).unflatten(2, (self.num_attention_heads, -1))
         key = key.squeeze(1).unflatten(2, (self.num_attention_heads, -1))
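Note: here (and in t5.py below) direct calls to norm.forward_native(...) become plain module calls, so the layer can dispatch to an optimized implementation instead of being pinned to the reference path. A sketch of the usual CustomOp-style dispatch, assuming FastVideo's CustomOp follows the common forward_native/forward_cuda convention; the class and dispatch logic here are illustrative, not the library's exact API:

    import torch
    import torch.nn as nn

    class DispatchingNorm(nn.Module):
        # Illustrative CustomOp-style layer: calling the module routes to
        # the best implementation; .forward_native() pins the reference.

        def forward_native(self, x: torch.Tensor) -> torch.Tensor:
            # Pure-PyTorch reference implementation.
            return x / x.norm(dim=-1, keepdim=True).clamp_min(1e-6)

        def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
            # Stand-in for a fused kernel; real code would call one here.
            return self.forward_native(x)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            impl = self.forward_cuda if x.is_cuda else self.forward_native
            return impl(x)

    norm = DispatchingNorm()
    y = norm(torch.randn(2, 4))                     # dispatched, as in this diff
    y_ref = norm.forward_native(torch.randn(2, 4))  # the old pinned call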

fastvideo/v1/models/encoders/t5.py

Lines changed: 3 additions & 3 deletions
@@ -124,7 +124,7 @@ def __init__(self,
         self.layer_norm = RMSNorm(config.d_model, eps=config.layer_norm_epsilon)
 
     def forward(self, hidden_states) -> torch.Tensor:
-        forwarded_states = self.layer_norm.forward_native(hidden_states)
+        forwarded_states = self.layer_norm(hidden_states)
         forwarded_states = self.DenseReluDense(forwarded_states)
         hidden_states = hidden_states + forwarded_states
         return hidden_states
@@ -362,7 +362,7 @@ def forward(
         attention_mask: torch.Tensor,
         attn_metadata: Optional[AttentionMetadata] = None,
     ) -> torch.Tensor:
-        normed_hidden_states = self.layer_norm.forward_native(hidden_states)
+        normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.SelfAttention(
             hidden_states=normed_hidden_states,
             attention_mask=attention_mask,
@@ -391,7 +391,7 @@ def forward(
         hidden_states: torch.Tensor,
         attn_metadata: Optional[AttentionMetadata] = None,
     ) -> torch.Tensor:
-        normed_hidden_states = self.layer_norm.forward_native(hidden_states)
+        normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.EncDecAttention(
             hidden_states=normed_hidden_states,
             attn_metadata=attn_metadata,

fastvideo/v1/models/loader/component_loader.py

Lines changed: 14 additions & 41 deletions
@@ -10,7 +10,6 @@
 from typing import Any, Generator, Iterable, List, Optional, Tuple, cast
 
 import torch
-import torch.distributed as dist
 import torch.nn as nn
 from safetensors.torch import load_file as safetensors_load_file
 from transformers import AutoImageProcessor, AutoTokenizer
@@ -21,9 +20,7 @@
 from fastvideo.v1.fastvideo_args import FastVideoArgs
 from fastvideo.v1.logger import init_logger
 from fastvideo.v1.models.hf_transformer_utils import get_diffusers_config
-from fastvideo.v1.models.loader.fsdp_load import (init_device_mesh,
-                                                  maybe_load_fsdp_model,
-                                                  shard_model)
+from fastvideo.v1.models.loader.fsdp_load import maybe_load_fsdp_model
 from fastvideo.v1.models.loader.utils import set_default_torch_dtype
 from fastvideo.v1.models.loader.weight_utils import (
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
@@ -166,19 +163,16 @@ def _prepare_weights(
         return hf_folder, hf_weights_files, use_safetensors
 
     def _get_weights_iterator(
-            self,
-            source: "Source",
-            to_cpu: bool = True
+            self, source: "Source"
     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
         """Get an iterator for the model weights based on the load format."""
         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
             source.model_or_path, source.fall_back_to_pt,
             source.allow_patterns_overrides)
         if use_safetensors:
-            weights_iterator = safetensors_weights_iterator(
-                hf_weights_files, to_cpu)
+            weights_iterator = safetensors_weights_iterator(hf_weights_files)
         else:
-            weights_iterator = pt_weights_iterator(hf_weights_files, to_cpu)
+            weights_iterator = pt_weights_iterator(hf_weights_files)
 
         if self.counter_before_loading_weights == 0.0:
             self.counter_before_loading_weights = time.perf_counter()
@@ -187,11 +181,10 @@ def _get_weights_iterator(
                    for (name, tensor) in weights_iterator)
 
     def _get_all_weights(
-            self,
-            model_config: Any,
-            model: nn.Module,
-            model_path: str,
-            to_cpu: bool = True
+            self,
+            model_config: Any,
+            model: nn.Module,
+            model_path: str,
     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
         primary_weights = TextEncoderLoader.Source(
             model_path,
@@ -200,14 +193,14 @@ def _get_all_weights(
             allow_patterns_overrides=getattr(model, "allow_patterns_overrides",
                                              None),
         )
-        yield from self._get_weights_iterator(primary_weights, to_cpu)
+        yield from self._get_weights_iterator(primary_weights)
 
         secondary_weights = cast(
             Iterable[TextEncoderLoader.Source],
             getattr(model, "secondary_weights", ()),
         )
         for source in secondary_weights:
-            yield from self._get_weights_iterator(source, to_cpu)
+            yield from self._get_weights_iterator(source)
 
     def load(self, model_path: str, architecture: str,
              fastvideo_args: FastVideoArgs):
@@ -243,19 +236,13 @@ def load(self, model_path: str, architecture: str,
         target_device = get_local_torch_device()
         # TODO(will): add support for other dtypes
         return self.load_model(model_path, encoder_config, target_device,
-                               fastvideo_args, encoder_precision)
+                               encoder_precision)
 
     def load_model(self,
                    model_path: str,
                    model_config: EncoderConfig,
                    target_device: torch.device,
-                   fastvideo_args: FastVideoArgs,
                    dtype: str = "fp16"):
-        use_cpu_offload = fastvideo_args.text_encoder_offload and len(
-            getattr(model_config, "_fsdp_shard_conditions", [])) > 0
-
-        if fastvideo_args.text_encoder_offload:
-            target_device = torch.device("cpu")
         with set_default_torch_dtype(PRECISION_TO_TYPE[dtype]):
             with target_device:
                 architectures = getattr(model_config, "architectures", [])
@@ -264,26 +251,12 @@ def load_model(self,
 
         weights_to_load = {name for name, _ in model.named_parameters()}
         loaded_weights = model.load_weights(
-            self._get_all_weights(model_config, model, model_path,
-                                  use_cpu_offload))
+            self._get_all_weights(model_config, model, model_path))
         self.counter_after_loading_weights = time.perf_counter()
         logger.info(
             "Loading weights took %.2f seconds",
             self.counter_after_loading_weights -
             self.counter_before_loading_weights)
-
-        if use_cpu_offload:
-            mesh = init_device_mesh(
-                "cuda",
-                mesh_shape=(1, dist.get_world_size()),
-                mesh_dim_names=("offload", "replicate"),
-            )
-            shard_model(model,
-                        cpu_offload=True,
-                        reshard_after_forward=True,
-                        mesh=mesh["offload"],
-                        fsdp_shard_conditions=model._fsdp_shard_conditions,
-                        pin_cpu_memory=fastvideo_args.pin_cpu_memory)
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         # if loaded_weights is not None:
@@ -320,7 +293,7 @@ def load(self, model_path: str, architecture: str,
         target_device = get_local_torch_device()
         # TODO(will): add support for other dtypes
         return self.load_model(
-            model_path, encoder_config, target_device, fastvideo_args,
+            model_path, encoder_config, target_device,
             fastvideo_args.pipeline_config.image_encoder_precision)
 
 
@@ -567,4 +540,4 @@ def load_module(module_name: str, component_model_path: str,
                              transformers_or_diffusers)
 
     # Load the module
-    return loader.load(component_model_path, architecture, fastvideo_args)
+    return loader.load(component_model_path, architecture, fastvideo_args)
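Note: the common thread in this file is that the to_cpu flag no longer flows from load_model through _get_all_weights into the weight iterators, and the post-load shard_model() CPU-offload pass is gone. A toy, runnable sketch of the simplified iterator contract; the real safetensors_weights_iterator/pt_weights_iterator read checkpoint files from disk, while this stand-in uses an in-memory dict:

    from typing import Dict, Generator, Tuple

    import torch

    def toy_weights_iterator(
        shards: Dict[str, torch.Tensor],
    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
        # No to_cpu parameter: tensors are yielded as loaded, with no
        # conditional hop to CPU for a later FSDP offload pass.
        for name, tensor in shards.items():
            yield name, tensor

    state = {"weight": torch.ones(2, 2), "bias": torch.zeros(2)}
    for name, tensor in toy_weights_iterator(state):
        print(name, tuple(tensor.shape))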
