Commit 8631c1b

[Training] Support Multi-Node training with FSDP + SP (#459)
1 parent 5357e12 commit 8631c1b

File tree

13 files changed (+338, -120 lines)


fastvideo/v1/attention/layer.py

Lines changed: 3 additions & 2 deletions
@@ -13,6 +13,7 @@
     get_sequence_model_parallel_rank, get_sequence_model_parallel_world_size)
 from fastvideo.v1.forward_context import ForwardContext, get_forward_context
 from fastvideo.v1.platforms import _Backend
+from fastvideo.v1.utils import get_compute_dtype


 class DistributedAttention(nn.Module):
@@ -38,7 +39,7 @@ def __init__(self,
         if num_kv_heads is None:
            num_kv_heads = num_heads

-        dtype = torch.get_default_dtype()
+        dtype = get_compute_dtype()
         attn_backend = get_attn_backend(
             head_size,
             dtype,
@@ -155,7 +156,7 @@ def __init__(self,
         if num_kv_heads is None:
             num_kv_heads = num_heads

-        dtype = torch.get_default_dtype()
+        dtype = get_compute_dtype()
         attn_backend = get_attn_backend(
             head_size,
             dtype,
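The switch from `torch.get_default_dtype()` to `get_compute_dtype()` matters for FSDP training, where the default dtype can stay at fp32 for master weights while the attention backend should be chosen for the bf16/fp16 compute dtype. A minimal sketch of that idea, assuming a hypothetical module-level registry (the real `get_compute_dtype()` lives in `fastvideo/v1/utils` and its implementation is not shown in this diff):

```python
import torch

# Hypothetical module-level setting for illustration only; the real helper
# in fastvideo.v1.utils may be implemented differently.
_COMPUTE_DTYPE = None


def set_compute_dtype(dtype: torch.dtype) -> None:
    """Register the dtype that attention kernels should be selected for."""
    global _COMPUTE_DTYPE
    _COMPUTE_DTYPE = dtype


def get_compute_dtype() -> torch.dtype:
    """Fall back to the framework default when no compute dtype was registered."""
    return _COMPUTE_DTYPE if _COMPUTE_DTYPE is not None else torch.get_default_dtype()


# Example: master weights in fp32, attention backend chosen for bf16.
set_compute_dtype(torch.bfloat16)
assert get_compute_dtype() == torch.bfloat16
```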

fastvideo/v1/dataset/parquet_datasets.py

Lines changed: 45 additions & 39 deletions
@@ -15,7 +15,8 @@
 from torch.utils.data import Dataset
 from torchdata.stateful_dataloader import StatefulDataLoader

-from fastvideo.v1.distributed import (get_sequence_model_parallel_rank,
+from fastvideo.v1.distributed import (get_dp_group,
+                                      get_sequence_model_parallel_rank,
                                       get_sp_group)
 from fastvideo.v1.logger import init_logger

@@ -38,13 +39,18 @@ def __init__(self,
         self.batch_size = batch_size
         self.rank = rank
         self.local_rank = get_sequence_model_parallel_rank()
-        self.sp_world_size = world_size
+        self.sp_group = get_sp_group()
+        self.dp_group = get_dp_group()
+        self.dp_world_size = self.dp_group.world_size
+        self.sp_world_size = self.sp_group.world_size
         self.world_size = int(os.getenv("WORLD_SIZE", 1))
         self.cfg_rate = cfg_rate
         self.num_latent_t = num_latent_t
         self.local_indices = None
         self.plan_output_dir = os.path.join(
-            self.path, f"data_plan_{self.world_size}_{self.sp_world_size}.json")
+            self.path,
+            f"data_plan_{self.world_size}_{self.sp_world_size}_{self.dp_world_size}.json"
+        )

         ranks = get_sp_group().ranks
         group_ranks: List[List] = [[] for _ in range(self.world_size)]
@@ -55,40 +61,40 @@ def __init__(self,
         # This will be useful when resume training
         if os.path.exists(self.plan_output_dir):
             print(f"Using existing plan from {self.plan_output_dir}")
-            dist.barrier()
-            return
-
-        # Find all parquet files recursively, and record num_rows for each file
-        print(f"Scanning for parquet files in {self.path}")
-        metadatas = []
-        for root, _, files in os.walk(self.path):
-            for file in sorted(files):
-                if file.endswith('.parquet'):
-                    file_path = os.path.join(root, file)
-                    num_rows = pq.ParquetFile(file_path).metadata.num_rows
-                    for row_idx in range(num_rows):
-                        metadatas.append((file_path, row_idx))
-
-        # Generate the plan that distribute rows among workers
-        random.seed(seed)
-        random.shuffle(metadatas)
-
-        # Get all sp groups
-        # e.g. if num_gpus = 4, sp_size = 2
-        # group_ranks = [(0, 1), (2, 3)]
-        # We will assign the same batches of data to ranks in the same sp group, and we'll assign different batches to ranks in different sp groups
-        # e.g. plan = {0: [row 1, row 4], 1: [row 1, row 4], 2: [row 2, row 3], 3: [row 2, row 3]}
-        group_ranks_list: List[Any] = list(
-            set(tuple(r) for r in group_ranks))
-        num_sp_groups = len(group_ranks_list)
-        plan = defaultdict(list)
-        for idx, metadata in enumerate(metadatas):
-            sp_group_idx = idx % num_sp_groups
-            for global_rank in group_ranks_list[sp_group_idx]:
-                plan[global_rank].append(metadata)
-
-        with open(self.plan_output_dir, "w") as f:
-            json.dump(plan, f)
+        else:
+            print(f"Creating new plan for {self.plan_output_dir}")
+            # Find all parquet files recursively, and record num_rows for each file
+            print(f"Scanning for parquet files in {self.path}")
+            metadatas = []
+            for root, _, files in os.walk(self.path):
+                for file in sorted(files):
+                    if file.endswith('.parquet'):
+                        file_path = os.path.join(root, file)
+                        num_rows = pq.ParquetFile(
+                            file_path).metadata.num_rows
+                        for row_idx in range(num_rows):
+                            metadatas.append((file_path, row_idx))
+
+            # Generate the plan that distribute rows among workers
+            random.seed(seed)
+            random.shuffle(metadatas)
+
+            # Get all sp groups
+            # e.g. if num_gpus = 4, sp_size = 2
+            # group_ranks = [(0, 1), (2, 3)]
+            # We will assign the same batches of data to ranks in the same sp group, and we'll assign different batches to ranks in different sp groups
+            # e.g. plan = {0: [row 1, row 4], 1: [row 1, row 4], 2: [row 2, row 3], 3: [row 2, row 3]}
+            group_ranks_list: List[Any] = list(
+                set(tuple(r) for r in group_ranks))
+            num_sp_groups = len(group_ranks_list)
+            plan = defaultdict(list)
+            for idx, metadata in enumerate(metadatas):
+                sp_group_idx = idx % num_sp_groups
+                for global_rank in group_ranks_list[sp_group_idx]:
+                    plan[global_rank].append(metadata)
+
+            with open(self.plan_output_dir, "w") as f:
+                json.dump(plan, f)
         dist.barrier()

     def __len__(self):
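As a quick illustration of the planning step above (a self-contained toy example, not repo code), assume 4 global ranks and sp_size = 2: every rank inside one sequence-parallel group receives the same rows, while different SP groups receive different rows in round-robin order.

```python
from collections import defaultdict

# Assumed toy setup: 4 global ranks, sp_size = 2 -> two SP groups.
group_ranks_list = [(0, 1), (2, 3)]
metadatas = [("shard-0.parquet", row) for row in range(6)]  # (file, row) records

plan = defaultdict(list)
for idx, metadata in enumerate(metadatas):
    sp_group_idx = idx % len(group_ranks_list)      # round-robin over SP groups
    for global_rank in group_ranks_list[sp_group_idx]:
        plan[global_rank].append(metadata)          # same rows within one SP group

# Ranks 0/1 get rows 0, 2, 4; ranks 2/3 get rows 1, 3, 5.
print({rank: [row for _, row in rows] for rank, rows in plan.items()})
```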
@@ -121,9 +127,9 @@ def __getitem__(self, idx):
         cumulative = 0
         for i in range(parquet_file.num_row_groups):
             num_rows = parquet_file.metadata.row_group(i).num_rows
-            if cumulative + num_rows > idx:
+            if cumulative + num_rows > row_idx:
                 row_group_index = i
-                local_index = idx - cumulative
+                local_index = row_idx - cumulative
                 break
             cumulative += num_rows
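The hunk above fixes the row-group lookup to use the row index stored in the plan (`row_idx`) rather than the dataset index (`idx`). A pure-Python sketch of that lookup, with made-up row-group sizes:

```python
# Assumed example: three row groups with 100, 100 and 50 rows.
row_group_sizes = [100, 100, 50]
row_idx = 180  # global row index within this parquet file

cumulative = 0
for i, num_rows in enumerate(row_group_sizes):
    if cumulative + num_rows > row_idx:
        row_group_index = i                 # row group holding row_idx
        local_index = row_idx - cumulative  # offset within that row group
        break
    cumulative += num_rows

print(row_group_index, local_index)  # 1 80
```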

fastvideo/v1/distributed/__init__.py

Lines changed: 8 additions & 2 deletions
@@ -2,8 +2,10 @@

 from fastvideo.v1.distributed.communication_op import *
 from fastvideo.v1.distributed.parallel_state import (
-    cleanup_dist_env_and_memory, get_sequence_model_parallel_rank,
-    get_sequence_model_parallel_world_size, get_tensor_model_parallel_rank,
+    cleanup_dist_env_and_memory, get_data_parallel_rank,
+    get_data_parallel_world_size, get_dp_group,
+    get_sequence_model_parallel_rank, get_sequence_model_parallel_world_size,
+    get_sp_group, get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size, get_world_group,
     init_distributed_environment, initialize_model_parallel,
     model_parallel_is_initialized)
@@ -12,11 +14,15 @@
 __all__ = [
     "init_distributed_environment",
     "initialize_model_parallel",
+    "get_data_parallel_world_size",
+    "get_data_parallel_rank",
     "get_sequence_model_parallel_rank",
     "get_sequence_model_parallel_world_size",
     "get_tensor_model_parallel_rank",
     "get_tensor_model_parallel_world_size",
     "cleanup_dist_env_and_memory",
     "get_world_group",
+    "get_dp_group",
+    "get_sp_group",
     "model_parallel_is_initialized",
 ]
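With these re-exports in place, training code can pull the data-parallel helpers straight from the package. A hedged usage sketch, assuming the distributed environment and model-parallel groups have already been initialized on every rank:

```python
from fastvideo.v1.distributed import (get_data_parallel_rank,
                                      get_data_parallel_world_size,
                                      get_dp_group, get_sp_group)

# Only valid after init_distributed_environment() and
# initialize_model_parallel(...) have been called on this rank.
dp_rank = get_data_parallel_rank()
dp_world_size = get_data_parallel_world_size()
sp_world_size = get_sp_group().world_size
print(f"dp rank {dp_rank}/{dp_world_size}, sp world size {sp_world_size}")
```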

fastvideo/v1/distributed/parallel_state.py

Lines changed: 45 additions & 3 deletions
@@ -704,7 +704,7 @@ def init_world_group(ranks: List[int], local_rank: int,
         group_ranks=[ranks],
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_device_communicator=False,
+        use_device_communicator=True,
         group_name="world",
     )

@@ -794,9 +794,18 @@ def get_sp_group() -> GroupCoordinator:
     return _SP


+_DP: Optional[GroupCoordinator] = None
+
+
+def get_dp_group() -> GroupCoordinator:
+    assert _DP is not None, ("data parallel group is not initialized")
+    return _DP
+
+
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     sequence_model_parallel_size: int = 1,
+    data_parallel_size: int = 1,
     backend: Optional[str] = None,
 ) -> None:
     """
@@ -852,6 +861,22 @@
         backend,
         group_name="sp")

+    # Build the data parallel groups.
+    num_data_parallel_groups: int = (world_size // data_parallel_size)
+    global _DP
+    assert _DP is None, ("data parallel group is already initialized")
+    group_ranks = []
+
+    for i in range(num_data_parallel_groups):
+        ranks = list(range(i * data_parallel_size,
+                           (i + 1) * data_parallel_size))
+        group_ranks.append(ranks)
+
+    _DP = init_model_parallel_group(group_ranks,
+                                    get_world_group().local_rank,
+                                    backend,
+                                    group_name="dp")
+

 def get_sequence_model_parallel_world_size() -> int:
     """Return world size for the sequence model parallel group."""
@@ -863,9 +888,20 @@ def get_sequence_model_parallel_rank() -> int:
     return get_sp_group().rank_in_group


+def get_data_parallel_world_size() -> int:
+    """Return world size for the data parallel group."""
+    return get_dp_group().world_size
+
+
+def get_data_parallel_rank() -> int:
+    """Return my rank for the data parallel group."""
+    return get_dp_group().rank_in_group
+
+
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     sequence_model_parallel_size: int,
+    data_parallel_size: int,
     backend: Optional[str] = None,
 ) -> None:
     """Helper to initialize model parallel groups if they are not initialized,
@@ -876,7 +912,8 @@
         get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(tensor_model_parallel_size,
-                                  sequence_model_parallel_size, backend)
+                                  sequence_model_parallel_size,
+                                  data_parallel_size, backend)
         return

     assert (
@@ -895,7 +932,7 @@

 def model_parallel_is_initialized() -> bool:
     """Check if tensor, sequence parallel groups are initialized."""
-    return _TP is not None and _SP is not None
+    return _TP is not None and _SP is not None and _DP is not None


 _TP_STATE_PATCHED = False
@@ -948,6 +985,11 @@ def destroy_model_parallel() -> None:
         _SP.destroy()
     _SP = None

+    global _DP
+    if _DP:
+        _DP.destroy()
+    _DP = None
+

 def destroy_distributed_environment() -> None:
     global _WORLD
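For intuition about the rank layout built in `initialize_model_parallel` above: consecutive global ranks are grouped into data-parallel groups of size `data_parallel_size`. A stand-alone sketch of just that arithmetic (pure Python, assumed sizes, not repo code):

```python
# Assumed sizes for illustration only.
world_size = 8
data_parallel_size = 2

num_data_parallel_groups = world_size // data_parallel_size
group_ranks = [
    list(range(i * data_parallel_size, (i + 1) * data_parallel_size))
    for i in range(num_data_parallel_groups)
]
print(group_ranks)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
```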

fastvideo/v1/fastvideo_args.py

Lines changed: 32 additions & 1 deletion
@@ -44,6 +44,8 @@ class FastVideoArgs:
     num_gpus: int = 1
     tp_size: Optional[int] = None
     sp_size: Optional[int] = None
+    dp_size: int = 1
+    dp_shards: Optional[int] = None
     dist_timeout: Optional[int] = None  # timeout for torch.distributed

     # Video generation parameters
@@ -70,7 +72,7 @@ class FastVideoArgs:
     # Text encoder configuration
     DEFAULT_TEXT_ENCODER_PRECISIONS = (
         "fp16",
-        "fp16",
+        # "fp16",
     )
     text_encoder_precisions: Tuple[str, ...] = field(
         default_factory=lambda: FastVideoArgs.DEFAULT_TEXT_ENCODER_PRECISIONS)
@@ -179,6 +181,20 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
            default=FastVideoArgs.sp_size,
            help="The sequence parallelism size.",
        )
+        parser.add_argument(
+            "--data-parallel-size",
+            "--dp-size",
+            type=int,
+            default=FastVideoArgs.dp_size,
+            help="The data parallelism size.",
+        )
+        parser.add_argument(
+            "--data-parallel-shards",
+            "--dp-shards",
+            type=int,
+            default=FastVideoArgs.dp_shards,
+            help="The data parallelism shards.",
+        )
         parser.add_argument(
             "--dist-timeout",
             type=int,
@@ -332,6 +348,10 @@ def from_cli_args(cls, args: argparse.Namespace) -> "FastVideoArgs":
                 kwargs[attr] = args.tensor_parallel_size
             elif attr == 'sp_size' and hasattr(args, 'sequence_parallel_size'):
                 kwargs[attr] = args.sequence_parallel_size
+            elif attr == 'dp_size' and hasattr(args, 'data_parallel_size'):
+                kwargs[attr] = args.data_parallel_size
+            elif attr == 'dp_shards' and hasattr(args, 'data_parallel_shards'):
+                kwargs[attr] = args.data_parallel_shards
             elif attr == 'flow_shift' and hasattr(args, 'shift'):
                 kwargs[attr] = args.shift
             # Use getattr with default value from the dataclass for potentially missing attributes
@@ -343,10 +363,17 @@ def from_cli_args(cls, args: argparse.Namespace) -> "FastVideoArgs":

     def check_fastvideo_args(self) -> None:
         """Validate inference arguments for consistency"""
+        if not self.inference_mode:
+            assert self.dp_size is not None, "dp_size must be set for training"
+            assert self.dp_shards is not None, "dp_shards must be set for training"
+            assert self.sp_size is not None, "sp_size must be set for training"
+
         if self.tp_size is None:
             self.tp_size = self.num_gpus
         if self.sp_size is None:
             self.sp_size = self.num_gpus
+        if self.dp_shards is None:
+            self.dp_shards = self.num_gpus

         if self.num_gpus < max(self.tp_size, self.sp_size):
             self.num_gpus = max(self.tp_size, self.sp_size)
@@ -535,6 +562,10 @@ def from_cli_args(cls, args: argparse.Namespace) -> "TrainingArgs":
                 kwargs[attr] = args.sequence_parallel_size
             elif attr == 'flow_shift' and hasattr(args, 'shift'):
                 kwargs[attr] = args.shift
+            elif attr == 'dp_size' and hasattr(args, 'data_parallel_size'):
+                kwargs[attr] = args.data_parallel_size
+            elif attr == 'dp_shards' and hasattr(args, 'data_parallel_shards'):
+                kwargs[attr] = args.data_parallel_shards
             # Use getattr with default value from the dataclass for potentially missing attributes
             else:
                 default_value = getattr(cls, attr, None)
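To see how the new flags are meant to be passed, here is a minimal argparse sketch mirroring the arguments added above. The real code registers them through `FastVideoArgs.add_cli_args` on the repo's `FlexibleArgumentParser`; the parser below is only a stand-in:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data-parallel-size", "--dp-size", type=int, default=1,
                    help="The data parallelism size.")
parser.add_argument("--data-parallel-shards", "--dp-shards", type=int, default=None,
                    help="The data parallelism shards.")
parser.add_argument("--sequence-parallel-size", "--sp-size", type=int, default=None,
                    help="The sequence parallelism size.")

# Example invocation (values are illustrative, not a recommended configuration).
args = parser.parse_args(["--dp-size", "2", "--dp-shards", "4", "--sp-size", "2"])
print(args.data_parallel_size, args.data_parallel_shards, args.sequence_parallel_size)
```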

fastvideo/v1/models/loader/component_loader.py

Lines changed: 11 additions & 2 deletions
@@ -15,7 +15,7 @@
 from transformers import AutoImageProcessor, AutoTokenizer
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

-from fastvideo.v1.fastvideo_args import FastVideoArgs
+from fastvideo.v1.fastvideo_args import FastVideoArgs, TrainingArgs
 from fastvideo.v1.logger import init_logger
 from fastvideo.v1.models.hf_transformer_utils import get_diffusers_config
 from fastvideo.v1.models.loader.fsdp_load import load_fsdp_model
@@ -391,11 +391,18 @@ def load(self, model_path: str, architecture: str,
                     len(safetensors_list), model_path)

         # initialize_sequence_parallel_group(fastvideo_args.sp_size)
-        default_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
+        if fastvideo_args.training_mode:
+            assert isinstance(
+                fastvideo_args, TrainingArgs
+            ), "fastvideo_args must be a TrainingArgs object when training_mode is True"
+            default_dtype = PRECISION_TO_TYPE[fastvideo_args.master_weight_type]
+        else:
+            default_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]

         # Load the model using FSDP loader
         logger.info("Loading model from %s, default_dtype: %s", cls_name,
                     default_dtype)
+        assert fastvideo_args.dp_shards is not None
         model = load_fsdp_model(
             model_cls=model_cls,
             init_params={
@@ -404,6 +411,8 @@
             },
             weight_dir_list=safetensors_list,
             device=fastvideo_args.device,
+            data_parallel_size=fastvideo_args.dp_size,
+            data_parallel_shards=fastvideo_args.dp_shards,
             cpu_offload=fastvideo_args.use_cpu_offload,
             default_dtype=default_dtype,
             # TODO(will): make these configurable
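The dtype selection above picks the FSDP master-weight precision during training and the inference precision otherwise. A small self-contained sketch of that decision; the `PRECISION_TO_TYPE` mapping and function name here are stand-ins, not the repo's exact definitions:

```python
import torch

# Assumed mapping for illustration; the repo defines its own PRECISION_TO_TYPE.
PRECISION_TO_TYPE = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def pick_default_dtype(training_mode: bool, master_weight_type: str,
                       precision: str) -> torch.dtype:
    # Training: keep FSDP master weights in the (usually higher) master precision.
    # Inference: load directly in the requested compute precision.
    key = master_weight_type if training_mode else precision
    return PRECISION_TO_TYPE[key]


print(pick_default_dtype(True, "fp32", "bf16"))   # torch.float32
print(pick_default_dtype(False, "fp32", "bf16"))  # torch.bfloat16
```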
