Commit 77ec773

[zero]remove registered gradients hooks (#5687)
* remove registered hooks fix fix fix zero fix fix fix fix fix zero fix zero fix fix fix
* fix fix fix
1 parent c25f83c commit 77ec773

7 files changed: +256 -167 lines changed
colossalai/booster/plugin/hybrid_parallel_plugin.py

Lines changed: 4 additions & 4 deletions
@@ -735,7 +735,7 @@ def _get_grads_to_sync(all_working_grads) -> Union[List[Tensor], None]:
         # Get all working gradients and gradients to be synchronized.
         all_working_grads = _get_all_working_grads()
         grads_to_sync = _get_grads_to_sync(all_working_grads)
-        if self.require_grad_sync and grads_to_sync is not None:
+        if self._grad_store.require_grad_sync and grads_to_sync is not None:
             # Synchronize sequence parallelism gradients if required.
             SeqParallelUtils.allreduce_partial_data_grad(process_group=self.tp_pg, grads=grads_to_sync)
         else:

@@ -759,7 +759,7 @@ def backward(self, loss, retain_graph=False):
         # Call the superclass backward method to compute gradients.
         super().backward(loss, retain_graph)

-        if self.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
+        if self._grad_store.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
             # If gradient synchronization is required, sync sequence parallelism gradients.
             self._sync_sp_grads()
         else:

@@ -784,7 +784,7 @@ def backward_by_grad(self, tensor, grad):
         # Call the superclass backward_by_grad method to compute gradients.
         super().backward_by_grad(tensor, grad)

-        if self.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
+        if self._grad_store.require_grad_sync and self.model.shard_config.enable_sequence_parallelism:
             # If gradient synchronization is required, sync sequence parallelism gradients.
             self._sync_sp_grads()
         else:

@@ -1272,7 +1272,7 @@ def execute_pipeline(

         # run with gradients accumulation
         if model.require_grad_sync == False or (
-            isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False
+            isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer._grad_store.require_grad_sync == False
         ):
             return outputs
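The plugin-side change is mechanical: every read of the gradient-sync flag now goes through the optimizer's gradient store rather than an attribute on the optimizer itself. Below is a minimal sketch of the new access pattern; `_TinyGradStore` and `_TinyZeroOptimizer` are stand-ins, not the colossalai classes (the real `HybridParallelZeroOptimizer` and `GradientStore` take many more arguments).

```python
# Sketch only: illustrates where the require_grad_sync flag lives after this commit.


class _TinyGradStore:
    def __init__(self, require_grad_sync: bool = True):
        # The flag that used to sit on the optimizer now lives on the store.
        self.require_grad_sync = require_grad_sync


class _TinyZeroOptimizer:
    def __init__(self):
        self._grad_store = _TinyGradStore()

    def _maybe_sync_sp_grads(self) -> str:
        # Mirrors the updated checks in hybrid_parallel_plugin.py.
        if self._grad_store.require_grad_sync:
            return "sync sequence-parallel grads"
        return "skip sync (accumulating)"


opt = _TinyZeroOptimizer()
print(opt._maybe_sync_sp_grads())          # sync sequence-parallel grads
opt._grad_store.require_grad_sync = False  # e.g. toggled during grad accumulation
print(opt._maybe_sync_sp_grads())          # skip sync (accumulating)
```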

colossalai/zero/low_level/bookkeeping/base_store.py

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ class BaseStore:
     def __init__(self, torch_pg: ProcessGroup):
         self._world_size = dist.get_world_size(group=torch_pg)
         self._local_rank = dist.get_rank(group=torch_pg)
+        self.torch_pg = torch_pg

     @property
     def world_size(self):
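Keeping the raw process group on `BaseStore` lets subclasses derive rank and world size from the same group later, as the new `BucketStore` constructor below does. A tiny sketch of that pattern, using a fake group object because a real `torch.distributed` ProcessGroup needs an initialized backend:

```python
# Sketch only: _FakeGroup stands in for torch.distributed.ProcessGroup so the
# example runs without dist.init_process_group().


class _FakeGroup:
    def __init__(self, world_size: int, rank: int):
        self.world_size = world_size
        self.rank = rank


class _SketchBaseStore:
    def __init__(self, torch_pg: _FakeGroup):
        self._world_size = torch_pg.world_size
        self._local_rank = torch_pg.rank
        self.torch_pg = torch_pg  # new in this commit: keep the group itself


class _SketchBucketStore(_SketchBaseStore):
    def __init__(self, torch_pg: _FakeGroup):
        super().__init__(torch_pg)
        # Subclasses can now re-query the stored group directly.
        self.zero_local_rank = self.torch_pg.rank
        self.zero_world_size = self.torch_pg.world_size


store = _SketchBucketStore(_FakeGroup(world_size=4, rank=1))
assert (store.zero_world_size, store.zero_local_rank) == (4, 1)
```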

colossalai/zero/low_level/bookkeeping/bucket_store.py

Lines changed: 29 additions & 2 deletions
@@ -1,16 +1,43 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import torch
+import torch.distributed as dist
 from torch import Tensor
 from torch._utils import _flatten_dense_tensors
 from torch.distributed import ProcessGroup
 
+from colossalai.accelerator import get_accelerator
+
 from .base_store import BaseStore
 
 
 class BucketStore(BaseStore):
-    def __init__(self, torch_pg: ProcessGroup):
+    def __init__(
+        self,
+        torch_pg: ProcessGroup,
+        reduce_bucket_size: int,
+        overlap_communication: bool,
+        communication_dtype: Optional[torch.dtype] = None,
+        moe_extra_dp_process_group: ProcessGroup = None,
+    ):
         super().__init__(torch_pg)
+        self.reduce_bucket_size = reduce_bucket_size
+        # communication params
+        self._overlap_communication = overlap_communication
+        self._communication_dtype = communication_dtype
+        if self._overlap_communication:
+            self.comm_stream = get_accelerator().Stream()
+        self.zero_local_rank = dist.get_rank(group=self.torch_pg)
+        self.zero_world_size = dist.get_world_size(group=self.torch_pg)
+        # extra dp
+        # This group is used to sync moe param, dp_world_size = moe_duplicates * extra_dp_size.
+        # Non moe param will be sync by global dp pg, moe param will be sync by extra dp pg.
+        # Moe param grad is be split as non moe param by global dp pg, and grad will be merged in step.
+        # And moe working and master param are split by extra dp pg.
+        self.moe_extra_dp_pg = moe_extra_dp_process_group
+        if self.moe_extra_dp_pg is not None:
+            self.moe_extra_dp_pg_size = dist.get_world_size(group=self.moe_extra_dp_pg)
+            self.moe_extra_dp_pg_rank = dist.get_rank(group=self.moe_extra_dp_pg)
         self.reset_all()
 
     def reset_all(self) -> None:
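`BucketStore` now receives the communication settings (reduce bucket size, overlap flag, communication dtype) and the optional MoE extra data-parallel group directly in its constructor. The diff comment states the size relation dp_world_size = moe_duplicates * extra_dp_size; a quick sanity check of that arithmetic, with made-up numbers for illustration:

```python
# Illustrative numbers only: a global data-parallel world of 8 ranks where each
# MoE expert is duplicated across 4 ranks leaves an extra-dp group of size 2.
dp_world_size = 8
moe_duplicates = 4

extra_dp_size = dp_world_size // moe_duplicates
assert dp_world_size == moe_duplicates * extra_dp_size  # relation from the diff comment

# Non-MoE params sync over the global dp group (8 ranks);
# MoE params sync over the extra-dp group (2 ranks).
print(f"global dp group: {dp_world_size} ranks, moe extra dp group: {extra_dp_size} ranks")
```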

colossalai/zero/low_level/bookkeeping/gradient_store.py

Lines changed: 6 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 
 class GradientStore(BaseStore):
-    def __init__(self, *args, partition_grad: bool = False):
+    def __init__(self, *args, partition_grad: bool = False, require_grad_sync: bool = True):
         super().__init__(*args)
         """
         self._grads_of_params mapping the parameter and its gradient slices

@@ -18,9 +18,12 @@ def __init__(self, *args, partition_grad: bool = False):
         }
         """
         self._grads_of_params = dict()
-        # for zero2, it's `param_id: [grad_local_rank]`
+        # stage 2
+        self._partition_grads = partition_grad
+        # grad accumulation
+        self.require_grad_sync = require_grad_sync
         self._working_index = 0 if partition_grad else self._local_rank
-
+        # for zero2, it's `param_id: [grad_local_rank]`
         self.grad_to_param_mapping = dict()
 
     def get_partitioned_gradients_by_param_id(self, group_id: int, param_id: int) -> List:
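`GradientStore` now keeps both flags itself: `_partition_grads` (ZeRO stage 2 gradient partitioning, which also selects the working index) and the new `require_grad_sync` (whether reduction should run, or gradients are being accumulated). A minimal stand-alone sketch of how the working index follows `partition_grad`, mirroring the unchanged line in the diff; `working_index` is a hypothetical helper, and the real store derives the local rank from its process group.

```python
# Sketch only: shows how `_working_index = 0 if partition_grad else local_rank` behaves.


def working_index(partition_grad: bool, local_rank: int) -> int:
    # Stage 2 keeps only the local gradient shard, so every rank reads slot 0;
    # otherwise each rank indexes its own slice by local rank.
    return 0 if partition_grad else local_rank


assert working_index(partition_grad=True, local_rank=3) == 0   # ZeRO stage 2
assert working_index(partition_grad=False, local_rank=3) == 3  # ZeRO stage 1
```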
