@@ -142,30 +142,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -261,30 +237,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -385,30 +337,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -500,30 +428,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -761,30 +665,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -972,30 +852,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # weight,
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
@@ -1169,30 +1025,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 ),
             )
             grad_weight = None
-            # if grad.dtype == torch.float32:
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
-            # ),
-            # )
-            # grad_weight = None
-            # elif grad.dtype in (torch.float16, torch.bfloat16):
-            # WeightGradStore.put(
-            # total_input,
-            # grad_output,
-            # (weight, weight_origin),
-            # functools.partial(
-            # execute_w_pass_grad_accum,
-            # wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
-            # ),
-            # )
-            # grad_weight = None
-            # else:
-            # raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
         else:
             if grad.dtype == torch.float32:
                 fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
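Note on the code path these hunks keep: every hunk follows the same shape. The deferred branch pushes total_input, grad_output, the weight (or a (weight, weight_origin) pair) and a functools.partial(execute_w_pass_grad_accum, wgrad_gemm_accum_func=...) callback into WeightGradStore, while the else branch immediately runs Apex's fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32 or _fp16 kernel selected by grad.dtype. The sketch below is only a rough, pure-PyTorch illustration of that deferral idea: MiniWeightGradStore, wgrad_accum, deferred_wgrad and queue_or_run_wgrad are hypothetical names invented here, and a plain matmul stands in for the fused CUDA kernels; it is not the repository's implementation.

import functools
from typing import Callable, List, Tuple

import torch


class MiniWeightGradStore:
    """Hypothetical stand-in for WeightGradStore: queues deferred weight-gradient work."""

    _queue: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Callable]] = []

    @classmethod
    def put(cls, total_input, grad_output, weight, compute_fn):
        # Defer the wgrad GEMM instead of running it inside backward().
        cls._queue.append((total_input, grad_output, weight, compute_fn))

    @classmethod
    def flush(cls):
        # Run every deferred weight-gradient computation (e.g. during a pipeline bubble).
        for total_input, grad_output, weight, compute_fn in cls._queue:
            compute_fn(total_input, grad_output, weight.main_grad)
        cls._queue.clear()


def wgrad_accum(total_input, grad_output, main_grad):
    # Plain-PyTorch substitute for the fused wgrad_gemm_accum_* kernels:
    # accumulate grad_output^T @ total_input into the main_grad buffer.
    main_grad.add_((grad_output.t().float() @ total_input.float()).to(main_grad.dtype))


def deferred_wgrad(_input_, _grad_output_, _weight_main_grad_, wgrad_gemm_accum_func=None):
    # Loosely mirrors execute_w_pass_grad_accum in the diff: invoke the chosen kernel later.
    wgrad_gemm_accum_func(_input_, _grad_output_, _weight_main_grad_)


def queue_or_run_wgrad(total_input, grad_output, weight, defer):
    grad = weight.main_grad
    if defer:
        MiniWeightGradStore.put(
            total_input,
            grad_output,
            weight,
            functools.partial(deferred_wgrad, wgrad_gemm_accum_func=wgrad_accum),
        )
        return None  # dW is produced later, nothing to return now
    if grad.dtype == torch.float32:
        wgrad_accum(total_input, grad_output, grad)  # fused fp32 kernel in the real code
    elif grad.dtype in (torch.float16, torch.bfloat16):
        wgrad_accum(total_input, grad_output, grad)  # fused fp16/bf16 kernel in the real code
    else:
        raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
    return None


# Usage: the parameter carries an fp32 main_grad accumulation buffer, Megatron-style.
weight = torch.nn.Parameter(torch.randn(16, 32))
weight.main_grad = torch.zeros(16, 32)
x = torch.randn(8, 32)       # total_input
g = torch.randn(8, 16)       # grad_output
queue_or_run_wgrad(x, g, weight, defer=True)
MiniWeightGradStore.flush()  # executes the deferred dW accumulation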