Commit 0991003

[bugfix] [misc] fix denoising stage init; rename distributed env function; fix logging. (#481)
Co-authored-by: Will Lin <[email protected]>
Parent: 8f8ce6d

29 files changed (+194, −183 lines)

fastvideo/v1/attention/layer.py

Lines changed: 4 additions & 4 deletions
@@ -9,8 +9,8 @@
     get_attn_backend)
 from fastvideo.v1.distributed.communication_op import (
     sequence_model_parallel_all_gather, sequence_model_parallel_all_to_all_4D)
-from fastvideo.v1.distributed.parallel_state import (
-    get_sequence_model_parallel_rank, get_sequence_model_parallel_world_size)
+from fastvideo.v1.distributed.parallel_state import (get_sp_parallel_rank,
+                                                      get_sp_world_size)
 from fastvideo.v1.forward_context import ForwardContext, get_forward_context
 from fastvideo.v1.platforms import _Backend
 from fastvideo.v1.utils import get_compute_dtype
@@ -86,8 +86,8 @@ def forward(
         assert q.dim() == 4 and k.dim() == 4 and v.dim(
         ) == 4, "Expected 4D tensors"
         batch_size, seq_len, num_heads, head_dim = q.shape
-        local_rank = get_sequence_model_parallel_rank()
-        world_size = get_sequence_model_parallel_world_size()
+        local_rank = get_sp_parallel_rank()
+        world_size = get_sp_world_size()
 
         forward_context: ForwardContext = get_forward_context()
         ctx_attn_metadata = forward_context.attn_metadata
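For context, the renamed accessors get_sp_parallel_rank() and get_sp_world_size() report a process's position within its sequence-parallel (SP) group, which the attention layer uses to shard the sequence dimension. Below is a minimal, self-contained sketch of that rank-local slicing; the hard-coded rank and world size are stand-ins, since the real accessors require an initialized distributed environment.

import torch

# Stand-ins for get_sp_parallel_rank() / get_sp_world_size(); the real
# accessors only work inside an initialized distributed environment.
sp_rank, sp_world_size = 1, 2

# Toy tensor in the [batch, seq, heads, dim] layout the layer asserts on.
q = torch.randn(1, 8, 4, 16)

# Each SP rank keeps an equal slice of the sequence dimension.
assert q.shape[1] % sp_world_size == 0
q_local = q.chunk(sp_world_size, dim=1)[sp_rank]
print(q_local.shape)  # torch.Size([1, 4, 4, 16])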

fastvideo/v1/dataset/parquet_datasets.py

Lines changed: 26 additions & 29 deletions
@@ -15,9 +15,9 @@
 from torch.utils.data import Dataset
 from torchdata.stateful_dataloader import StatefulDataLoader
 
-from fastvideo.v1.distributed import (get_dp_group,
-                                      get_sequence_model_parallel_rank,
-                                      get_sp_group)
+from fastvideo.v1.distributed import (get_sp_group, get_sp_parallel_rank,
+                                      get_sp_world_size, get_world_rank,
+                                      get_world_size)
 from fastvideo.v1.logger import init_logger
 
 logger = init_logger(__name__)
@@ -28,23 +28,19 @@ class ParquetVideoTextDataset(Dataset):
 
     def __init__(self,
                  path: str,
-                 batch_size: int = 1024,
-                 rank: int = 0,
-                 world_size: int = 1,
+                 batch_size,
                  cfg_rate: float = 0.0,
                  num_latent_t: int = 2,
                  seed: int = 0,
                  validation: bool = False):
         super().__init__()
         self.path = str(path)
         self.batch_size = batch_size
-        self.rank = rank
-        self.local_rank = get_sequence_model_parallel_rank()
+        self.global_rank = get_world_rank()
+        self.rank_in_sp_group = get_sp_parallel_rank()
         self.sp_group = get_sp_group()
-        self.dp_group = get_dp_group()
-        self.dp_world_size = self.dp_group.world_size
-        self.sp_world_size = self.sp_group.world_size
-        self.world_size = int(os.getenv("WORLD_SIZE", 1))
+        self.sp_world_size = get_sp_world_size()
+        self.world_size = get_world_size()
         self.cfg_rate = cfg_rate
         self.num_latent_t = num_latent_t
         self.local_indices = None
@@ -56,22 +52,26 @@ def __init__(self,
 
         self.plan_output_dir = os.path.join(
             self.path,
-            f"data_plan_{self.world_size}_{self.sp_world_size}_{self.dp_world_size}.json"
+            f"data_plan_world_size_{self.world_size}_sp_size_{self.sp_world_size}.json"
         )
 
-        ranks = get_sp_group().ranks
+        # group_ranks: a list of lists
+        # len(group_ranks) = self.world_size
+        # len(group_ranks[i]) = self.sp_world_size
+        # group_ranks[i] represents the ranks of the SP group for the i-th GPU
+        # For example, if self.world_size = 4, self.sp_world_size = 2, then
+        # group_ranks = [[0, 1], [0, 1], [2, 3], [2, 3]]
+        sp_group_ranks = get_sp_group().ranks
         group_ranks: List[List] = [[] for _ in range(self.world_size)]
-        torch.distributed.all_gather_object(group_ranks, ranks)
+        dist.all_gather_object(group_ranks, sp_group_ranks)
 
-        if rank == 0:
+        if self.global_rank == 0:
             # If a plan already exists, then skip creating a new plan
             # This will be useful when resume training
             if os.path.exists(self.plan_output_dir):
-                print(f"Using existing plan from {self.plan_output_dir}")
+                logger.info("Using existing plan from %s", self.plan_output_dir)
             else:
-                print(f"Creating new plan for {self.plan_output_dir}")
-                # Find all parquet files recursively, and record num_rows for each file
-                print(f"Scanning for parquet files in {self.path}")
+                logger.info("Creating new plan for %s", self.plan_output_dir)
                 metadatas = []
                 for root, _, files in os.walk(self.path):
                     for file in sorted(files):
@@ -94,7 +94,7 @@ def __init__(self,
 
                 # Get all sp groups
                 # e.g. if num_gpus = 4, sp_size = 2
-                # group_ranks = [(0, 1), (2, 3)]
+                # group_ranks = [(0, 1), (0, 1), (2, 3), (2, 3)]
                 # We will assign the same batches of data to ranks in the same sp group, and we'll assign different batches to ranks in different sp groups
                 # e.g. plan = {0: [row 1, row 4], 1: [row 1, row 4], 2: [row 2, row 3], 3: [row 2, row 3]}
                 group_ranks_list: List[Any] = list(
@@ -113,7 +113,6 @@ def __init__(self,
                     json.dump(plan, f)
         else:
             pass
-
         dist.barrier()
         if validation:
             with open(self.plan_output_dir) as f:
@@ -168,7 +167,7 @@ def get_validation_negative_prompt(
 
         if self.cached_neg_prompt is None:
             raise RuntimeError(
-                f"Rank {self.rank} (SP rank {self.local_rank}): Could not retrieve negative prompt data"
+                f"Rank {self.global_rank} (SP rank {self.rank_in_sp_group}): Could not retrieve negative prompt data"
             )
 
         # Extract the components
@@ -186,15 +185,15 @@ def get_validation_negative_prompt(
             lat = rearrange(lat,
                             "t (n s) h w -> t n s h w",
                             n=self.sp_world_size).contiguous()
-            lat = lat[:, self.local_rank, :, :, :]
+            lat = lat[:, self.rank_in_sp_group, :, :, :]
         return lat, emb, mask, info
 
     def __len__(self):
         if self.local_indices is None:
             try:
                 with open(self.plan_output_dir) as f:
                     plan = json.load(f)
-                    self.local_indices = plan[str(self.rank)]
+                    self.local_indices = plan[str(self.global_rank)]
             except Exception as err:
                 raise Exception(
                     "The data plan hasn't been created yet") from err
@@ -206,7 +205,7 @@ def __getitem__(self, idx):
             try:
                 with open(self.plan_output_dir) as f:
                     plan = json.load(f)
-                    self.local_indices = plan[self.rank]
+                    self.local_indices = plan[self.global_rank]
             except Exception as err:
                 raise Exception(
                     "The data plan hasn't been created yet") from err
@@ -240,7 +239,7 @@ def __getitem__(self, idx):
             lat = rearrange(lat,
                             "t (n s) h w -> t n s h w",
                             n=self.sp_world_size).contiguous()
-            lat = lat[:, self.local_rank, :, :, :]
+            lat = lat[:, self.rank_in_sp_group, :, :, :]
         return lat, emb, mask, info
 
     def _process_row(self, row) -> Dict[str, Any]:
@@ -356,8 -355,6 @@ def _process_row(self, row) -> Dict[str, Any]:
     dataset = ParquetVideoTextDataset(
         args.path,
         batch_size=args.batch_size,
-        rank=rank,
-        world_size=world_size,
     )
 
     # Create DataLoader with proper settings
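The new comments spell out what dist.all_gather_object produces (one SP-group rank list per global rank) and what the resulting plan keyed by global rank looks like. Below is a self-contained sketch of that assignment scheme; the deduplication and round-robin split are illustrative only, not the repository's exact plan-building code.

# What dist.all_gather_object yields in the commented example:
# world_size = 4, sp_world_size = 2, one entry per global rank.
group_ranks = [[0, 1], [0, 1], [2, 3], [2, 3]]

# Deduplicate to one entry per SP group, e.g. [(0, 1), (2, 3)].
sp_groups = list(dict.fromkeys(tuple(r) for r in group_ranks))

rows = ["row 1", "row 2", "row 3", "row 4"]

# Every rank inside an SP group gets the same rows; different SP groups
# get disjoint rows (round-robin over the groups).
plan = {rank: [] for rank in range(len(group_ranks))}
for i, row in enumerate(rows):
    for rank in sp_groups[i % len(sp_groups)]:
        plan[rank].append(row)

print(plan)
# {0: ['row 1', 'row 3'], 1: ['row 1', 'row 3'],
#  2: ['row 2', 'row 4'], 3: ['row 2', 'row 4']}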

fastvideo/v1/distributed/__init__.py

Lines changed: 23 additions & 13 deletions
@@ -2,27 +2,37 @@
 
 from fastvideo.v1.distributed.communication_op import *
 from fastvideo.v1.distributed.parallel_state import (
-    cleanup_dist_env_and_memory, get_data_parallel_rank,
-    get_data_parallel_world_size, get_dp_group,
-    get_sequence_model_parallel_rank, get_sequence_model_parallel_world_size,
-    get_sp_group, get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size, get_world_group,
-    init_distributed_environment, initialize_model_parallel,
+    cleanup_dist_env_and_memory, get_dp_group, get_dp_rank, get_dp_world_size,
+    get_sp_group, get_sp_parallel_rank, get_sp_world_size, get_tp_group,
+    get_tp_rank, get_tp_world_size, get_world_group, get_world_rank,
+    get_world_size, init_distributed_environment, initialize_model_parallel,
     model_parallel_is_initialized)
 from fastvideo.v1.distributed.utils import *
 
 __all__ = [
+    # Initialization
     "init_distributed_environment",
     "initialize_model_parallel",
-    "get_data_parallel_world_size",
-    "get_data_parallel_rank",
-    "get_sequence_model_parallel_rank",
-    "get_sequence_model_parallel_world_size",
-    "get_tensor_model_parallel_rank",
-    "get_tensor_model_parallel_world_size",
     "cleanup_dist_env_and_memory",
+    "model_parallel_is_initialized",
+
+    # World group
     "get_world_group",
+    "get_world_rank",
+    "get_world_size",
+
+    # Data parallel group
     "get_dp_group",
+    "get_dp_rank",
+    "get_dp_world_size",
+
+    # Sequence parallel group
     "get_sp_group",
-    "model_parallel_is_initialized",
+    "get_sp_parallel_rank",
+    "get_sp_world_size",
+
+    # Tensor parallel group
+    "get_tp_group",
+    "get_tp_rank",
+    "get_tp_world_size",
 ]
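A brief sketch of how the regrouped public helpers might be queried from the package root. It assumes the distributed environment and model-parallel groups have already been set up by the training entrypoint (via init_distributed_environment / initialize_model_parallel, whose arguments are not shown in this diff).

from fastvideo.v1.distributed import (get_dp_rank, get_dp_world_size,
                                      get_sp_parallel_rank, get_sp_world_size,
                                      get_tp_rank, get_tp_world_size,
                                      get_world_rank, get_world_size)


def log_parallel_topology() -> None:
    # Assumes init_distributed_environment() / initialize_model_parallel()
    # have already been called; otherwise these accessors will fail.
    print(f"world rank {get_world_rank()} / {get_world_size()}")
    print(f"dp    rank {get_dp_rank()} / {get_dp_world_size()}")
    print(f"sp    rank {get_sp_parallel_rank()} / {get_sp_world_size()}")
    print(f"tp    rank {get_tp_rank()} / {get_tp_world_size()}")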

fastvideo/v1/distributed/parallel_state.py

Lines changed: 19 additions & 13 deletions
@@ -735,9 +735,6 @@ def get_tp_group() -> GroupCoordinator:
     return _TP
 
 
-# kept for backward compatibility
-get_tensor_model_parallel_group = get_tp_group
-
 _ENABLE_CUSTOM_ALL_REDUCE = True
 
 
@@ -878,22 +875,32 @@ def initialize_model_parallel(
                               group_name="dp")
 
 
-def get_sequence_model_parallel_world_size() -> int:
+def get_sp_world_size() -> int:
     """Return world size for the sequence model parallel group."""
     return get_sp_group().world_size
 
 
-def get_sequence_model_parallel_rank() -> int:
+def get_sp_parallel_rank() -> int:
     """Return my rank for the sequence model parallel group."""
     return get_sp_group().rank_in_group
 
 
-def get_data_parallel_world_size() -> int:
+def get_world_size() -> int:
+    """Return world size for the world group."""
+    return get_world_group().world_size
+
+
+def get_world_rank() -> int:
+    """Return my rank for the world group."""
+    return get_world_group().rank
+
+
+def get_dp_world_size() -> int:
     """Return world size for the data parallel group."""
     return get_dp_group().world_size
 
 
-def get_data_parallel_rank() -> int:
+def get_dp_rank() -> int:
     """Return my rank for the data parallel group."""
     return get_dp_group().rank_in_group
 
@@ -916,10 +923,9 @@ def ensure_model_parallel_initialized(
             data_parallel_size, backend)
         return
 
-    assert (
-        get_tensor_model_parallel_world_size() == tensor_model_parallel_size
-    ), ("tensor parallel group already initialized, but of unexpected size: "
-        f"{get_tensor_model_parallel_world_size()=} vs. "
+    assert (get_tp_world_size() == tensor_model_parallel_size), (
+        "tensor parallel group already initialized, but of unexpected size: "
+        f"{get_tp_world_size()=} vs. "
         f"{tensor_model_parallel_size=}")
 
     if sequence_model_parallel_size > 1:
@@ -963,12 +969,12 @@ def patch_tensor_parallel_group(tp_group: GroupCoordinator):
         _TP = old_tp_group
 
 
-def get_tensor_model_parallel_world_size() -> int:
+def get_tp_world_size() -> int:
     """Return world size for the tensor model parallel group."""
     return get_tp_group().world_size
 
 
-def get_tensor_model_parallel_rank() -> int:
+def get_tp_rank() -> int:
     """Return my rank for the tensor model parallel group."""
     return get_tp_group().rank_in_group
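To summarize the rename, here is a hypothetical compatibility shim mapping the old accessor names to the new ones. It is not part of this commit, which removes the old spellings outright; any such shim would have to live in downstream code.

# Hypothetical migration aliases; this commit deletes the old names rather
# than keeping them, so these assignments are illustration only.
from fastvideo.v1.distributed.parallel_state import (get_dp_rank,
                                                     get_dp_world_size,
                                                     get_sp_parallel_rank,
                                                     get_sp_world_size,
                                                     get_tp_group,
                                                     get_tp_rank,
                                                     get_tp_world_size)

get_sequence_model_parallel_rank = get_sp_parallel_rank
get_sequence_model_parallel_world_size = get_sp_world_size
get_data_parallel_rank = get_dp_rank
get_data_parallel_world_size = get_dp_world_size
get_tensor_model_parallel_rank = get_tp_rank
get_tensor_model_parallel_world_size = get_tp_world_size
get_tensor_model_parallel_group = get_tp_group  # alias also removed by this commit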

fastvideo/v1/fastvideo_args.py

Lines changed: 1 addition & 1 deletion
@@ -551,7 +551,7 @@ class TrainingArgs(FastVideoArgs):
     gradient_accumulation_steps: int = 0
     learning_rate: float = 0.0
     scale_lr: bool = False
-    lr_scheduler: str = ""
+    lr_scheduler: str = "constant"
     lr_warmup_steps: int = 0
     max_grad_norm: float = 0.0
     gradient_checkpointing: bool = False
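The default lr_scheduler is now the string "constant" instead of empty. How TrainingArgs is consumed is not shown in this diff; as a rough illustration only, a factory keyed on that string could look like the following (build_lr_scheduler is a hypothetical helper, not FastVideo code).

import torch


def build_lr_scheduler(name: str, optimizer: torch.optim.Optimizer):
    # Hypothetical stand-in for whatever consumes TrainingArgs.lr_scheduler;
    # only the new default, "constant", is handled here.
    if name == "constant":
        return torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)
    raise ValueError(f"unsupported lr_scheduler: {name!r}")


model = torch.nn.Linear(4, 4)
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
sched = build_lr_scheduler("constant", opt)
print(sched.get_last_lr())  # [0.0001]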
