@@ -96,6 +96,7 @@ def backward(ctx, grad_output):
         use_zbv = ctx.use_zbv
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
+        weight_origin = weight
         weight = weight.view(weight.shape)
         if bias is not None:
             bias = bias.view(bias.shape)
@@ -130,7 +131,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
@@ -141,7 +142,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
@@ -164,7 +165,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass,
                         wgrad_gemm_func=torch.matmul,
@@ -212,6 +213,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
             return wgrad_gemm_func(_input_.t(), _grad_output_)
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
+        weight_origin = weight
         weight = weight.view(weight.shape)
         if bias is not None:
             bias = bias.view(bias.shape)
@@ -232,7 +234,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
@@ -243,7 +245,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
@@ -266,7 +268,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass,
                         wgrad_gemm_func=torch.matmul,
@@ -1026,6 +1028,7 @@ def backward(ctx, grad_output):
         use_zbv = ctx.use_zbv
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
+        weight_origin = weight
         weight = weight.view(weight.shape)
         if use_bias:
             bias = bias.view(bias.shape)
@@ -1064,7 +1067,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
@@ -1075,7 +1078,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass_grad_accum,
                         wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
@@ -1098,7 +1101,7 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 WeightGradStore.put(
                     total_input,
                     grad_output,
-                    weight,
+                    (weight, weight_origin),
                     functools.partial(
                         execute_w_pass,
                         wgrad_gemm_func=torch.matmul,
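
A minimal sketch of the pattern this change relies on (illustrative only: HookedTensor stands in for a Gemini-managed tensor and is not a ColossalAI class). Calling weight.view(weight.shape) routes the op through '__torch_function__' so Gemini can intercept it, but it also returns a new tensor object; the pre-view reference is therefore kept as weight_origin and handed to WeightGradStore.put alongside the hooked view.

import torch

class HookedTensor(torch.Tensor):
    # Illustrative subclass: a Gemini-managed tensor intercepts ops the same
    # way via '__torch_function__' (e.g. to materialize the real parameter).
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        print("intercepted:", getattr(func, "__name__", func))
        return super().__torch_function__(func, types, args, kwargs)

weight = torch.randn(4, 4).as_subclass(HookedTensor)
weight_origin = weight              # keep a handle on the original tensor object
weight = weight.view(weight.shape)  # goes through __torch_function__, returns a NEW tensor
assert weight is not weight_origin  # only weight_origin still identifies the original object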