Commit 0d0d84c

Add bias as a weight we need to sync as well (#307)
1 parent c3be5d3 commit 0d0d84c

4 files changed: +9 -4 lines

megatron/arguments.py

Lines changed: 2 additions & 2 deletions
@@ -375,8 +375,8 @@ def _add_network_size_args(parser):
                        ', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
     group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                        help='Layer norm epsilon.')
-    group.add_argument('--layernorm-tp-auto-sync', action='store_true',
-                       help='Force syncing layernorm params across TP ranks in forward. '
+    group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
+                       help='Force syncing duplicated params across TP ranks in forward. '
                             'This is a workaround for an unresolved bug leading to TP ranks '
                             'getting out of sync with each other.')
     group.add_argument('--apply-residual-connection-post-layernorm',
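
Since argparse converts dashes in option names to underscores, the renamed flag is exposed as args.sync_tp_duplicated_parameters, which is why the call sites in the files below change along with it. A minimal standalone sketch of that mapping (plain argparse, not Megatron's full parser):

import argparse

# Standalone illustration only: Megatron registers this flag inside
# _add_network_size_args(); this reproduces just the dash-to-underscore mapping.
parser = argparse.ArgumentParser()
group = parser.add_argument_group('network size')
group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
                   help='Force syncing duplicated params across TP ranks in forward.')

args = parser.parse_args(['--sync-tp-duplicated-parameters'])
assert args.sync_tp_duplicated_parameters is True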

megatron/model/fused_layer_norm.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def __init__(self, normalized_shape, eps=1e-5):
         self.reset_parameters()
 
         args = get_args()
-        self.layernorm_tp_auto_sync = args.layernorm_tp_auto_sync
+        self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
 
         self.use_meg_ds_fused_layer_norm = (
             args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm

megatron/mpu/layers.py

Lines changed: 5 additions & 0 deletions
@@ -423,6 +423,7 @@ def __init__(self, input_size, output_size, bias=True,
         else:
             self.register_parameter('bias', None)
 
+        self.bias_tp_auto_sync = args.sync_tp_duplicated_parameters
 
 
     def forward(self, input_):
@@ -435,6 +436,10 @@ def forward(self, input_):
         output_parallel = F.linear(input_parallel, self.weight)
         # All-reduce across all the partitions.
         output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+
+        if self.bias_tp_auto_sync:
+            torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
+
         if not self.skip_bias_add:
             output = output_ + self.bias if self.bias is not None else output_
             output_bias = None
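
The added lines average the bias over the tensor-model-parallel group on every forward pass: in RowParallelLinear the bias is replicated on each TP rank rather than sharded, so any drift between the copies can be corrected with an all-reduce. A minimal sketch of the same pattern in isolation (the helper name and the SUM-then-divide form are illustrative and not part of this commit, which calls ReduceOp.AVG directly; AVG requires the NCCL backend):

import torch
import torch.distributed as dist

def sync_duplicated_param(param: torch.Tensor, group=None) -> None:
    # Average a tensor that every rank in `group` holds a full copy of.
    # Assumes torch.distributed is already initialized; illustrative only.
    world_size = dist.get_world_size(group=group)
    with torch.no_grad():
        # SUM followed by a divide is the backend-agnostic equivalent of
        # the ReduceOp.AVG used in the diff above.
        dist.all_reduce(param, op=dist.ReduceOp.SUM, group=group)
        param.div_(world_size)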

tests/test_training.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ def get_variation_config(self, variation, output_dir, n_samples=None):
             --clip-grad 1.0
             --weight-decay 1e-1
             --embed-layernorm
-            --layernorm-tp-auto-sync
+            --sync-tp-duplicated-parameters
             --fp16
 
             --log-level debug

0 commit comments
