Skip to content

Commit a9bedc7

Browse files
[Shardformer] Support zbv in Shardformer Policy (#6150)
* [feat] Shardformer support zbv * [feat] support chatglm2, command, deepseek for zbv * [feat] support zbv in shardformer policy: falcon,gptj,mistral,opt,qwen2,t5, vit, whisper * [feat] support GPT2FusedLinearConv1D * [feat] support GPT2FusedLinear (without tp) * [fix] debug FusedConvLinear * [shardformer] support gpt2 policy for zbv, support GPT2FusedLinearConv Col and Row. * [Shardformer] support FusedLinear1D base for zbv * [shardformer] support zbv in FusedLinear1D base, Col, Row * [shardformer] support zbv in blip2 and sam policy * [shardformer] fix bug incorrect number of gradients; add fusedLinear base testcase; * [fix] fix incorrect number of gradients ; * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [Shardformer] add en doc for zbv; * [fix] fix typo in Model compatibility table * [fix] fix API Reference typo * [Shardformer] add zh-Han doc for zbv * [fix] fix Linear name; update en & zh doc * [fix] fix shardformer doc import err * [fix] fix shardconfig import in doc * [fix] fix shardformer doc * [fix] fix shardconfig doc * [fix] fix config * [fix] remove shardconfig * [fix] fix doc * [feat] add zbv doc string * [fix] rm doc * [fix] fix doc * [fix] empty zbv doc * [fix] fix torch version * [fix] fix torch version * [fix] fix torch versions * [fix] fix torch versions * [fix] fix pyramid versions * [fix] fix pyramid, zope version * [fix] try fix workflow * [fix] try import ShardConfig in yml * [fix] fix workflow * [fix] fix workflow * [fix] fix workflow * [fix] fix workflow * [fix] fix ci * [fix] fix zbv doc * [fix] fix param for qkv linear, gpt2fused linear; fix requirements; * [fix] fix policy use fused_linear * [fix] fix weight grad none, err caused by weight ptr change * [fix] fix comm in WeightGradStore * [fix] fix WeightGradStore pop param * [fix] remove useless param in doc; fix gpt2 qkv test; * [shardformer] simplify execute_w_pass_grad_accum; * [fix] rm useless comments * [shardformer] 
simplify execute_w_pass_grad_accum & execute_w_pass * [shardformer] Run meaningful doc test * [shardformer] fix doc test cmd; --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent af06d16 commit a9bedc7

27 files changed

+3513
-318
lines changed

.github/workflows/doc_check_on_pr.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ jobs:
5858
# there is no main branch, so it's safe to checkout the main branch from the merged branch
5959
# docer will rebase the remote main branch to the merged branch, so we have to config user
6060
- name: Make the merged branch main
61+
6162
run: |
6263
cd ColossalAI
6364
git checkout -b main

colossalai/pipeline/schedule/zero_bubble_pp.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,19 @@ def _wait_p2p(wait_handles: List[torch.cuda.Event]) -> None:
3838

3939

4040
class ZeroBubbleVPipeScheduler(PipelineSchedule):
41+
r"""
42+
ZeroBubbleVPipeScheduler
43+
44+
Args:
45+
stage_manager (PipelineStageManager): If using pipeline parallelism, it's necessary to specify a pipeline stage manager for inter-process communication in pipeline parallelism. Defaults to None, which means not using pipeline parallelism.
46+
schedule (List[ScheduledNode]): Schedule for ZeroBubbleVPipe.
47+
num_model_chunks (int) : The number of model chunk in a device.
48+
num_microbatch (Optional[int]): The number of microbatch.
49+
microbatch_size (Optional[int]): The size per microbatch.
50+
enable_metadata_cache (bool): whether to enable metadata cache to accelerate communication.
51+
overlap_p2p (bool): whether to use overlap_p2p.
52+
"""
53+
4154
def __init__(
4255
self,
4356
stage_manager: PipelineStageManager,

colossalai/pipeline/weight_grad_store.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ class WeightGradStore:
88

99
@classmethod
1010
def put(cls, total_input, grad_output, weight, func):
11-
# func(total_input, grad_output, weight.main_grad)
1211
cls.cache.append((total_input, grad_output, weight, func))
1312

1413
@classmethod
@@ -18,15 +17,26 @@ def flush(cls, chunk=0):
1817

1918
@classmethod
2019
def pop(cls, chunk=0):
21-
# print(f"chunk id {chunk} queue size {cls.weight_grad_queue[chunk].qsize()}")
2220
if cls.weight_grad_queue[chunk].qsize() > 0:
2321
stored_grads = cls.weight_grad_queue[chunk].get()
2422
for total_input, grad_output, weight, func in stored_grads:
25-
if weight.grad is not None:
26-
func(total_input, grad_output, weight.grad)
27-
# for first bwd; weight.grad is None, assign grad_weight to weight.grad
23+
if isinstance(weight, tuple):
24+
# In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
25+
# View will lead to weight ptr change
26+
# weight_cal & weight_origin in tuple, weight_cal use to cal dw, weight_origin use to update
27+
_, weight_origin = weight
28+
if weight_origin.grad is not None:
29+
func(total_input, grad_output, weight_origin.grad)
30+
# for first bwd; weight.grad is None, assign grad_weight to weight.grad
31+
else:
32+
grad_weight = func(total_input, grad_output)
33+
weight_origin.grad = grad_weight
2834
else:
29-
grad_weight = func(total_input, grad_output)
30-
weight.grad = grad_weight
35+
if weight.grad is not None:
36+
func(total_input, grad_output, weight.grad)
37+
# for first bwd; weight.grad is None, assign grad_weight to weight.grad
38+
else:
39+
grad_weight = func(total_input, grad_output)
40+
weight.grad = grad_weight
3141
else:
3242
raise Exception("Pop empty queue.")

colossalai/shardformer/layer/__init__.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,24 @@
66
from .loss import cross_entropy_1d, dist_cross_entropy
77
from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
88
from .parallel_module import ParallelModule
9-
from .qkv_fused_linear import FusedLinear1D_Col, FusedLinear1D_Row, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
9+
from .qkv_fused_linear import (
10+
FusedLinear,
11+
FusedLinear1D_Col,
12+
FusedLinear1D_Row,
13+
GPT2FusedLinearConv,
14+
GPT2FusedLinearConv1D_Col,
15+
GPT2FusedLinearConv1D_Row,
16+
)
1017

1118
__all__ = [
1219
"Embedding1D",
1320
"VocabParallelEmbedding1D",
1421
"LinearWithGradAccum",
1522
"Linear1D_Col",
1623
"Linear1D_Row",
17-
"GPT2FusedLinearConv1D_Col",
24+
"GPT2FusedLinearConv",
1825
"GPT2FusedLinearConv1D_Row",
26+
"GPT2FusedLinearConv1D_Col",
1927
"DropoutForParallelInput",
2028
"DropoutForReplicatedInput",
2129
"cross_entropy_1d",
@@ -26,6 +34,7 @@
2634
"FusedLayerNorm",
2735
"FusedRMSNorm",
2836
"FusedLinear1D_Col",
37+
"FusedLinear",
2938
"ParallelModule",
3039
"PaddingEmbedding",
3140
"PaddingLMHead",

0 commit comments

Comments
 (0)