@@ -384,16 +384,15 @@ index a8f4abfcd..f33f6f05e 100755
384384
385385 if self.config.recompute_method == 'uniform':
386386diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
387- index e2705bd9f..83a947c00 100644
387+ index e2705bd9f..a0aa109b5 100644
388388--- a/megatron/core/transformer/transformer_config.py
389389+++ b/megatron/core/transformer/transformer_config.py
390- @@ -210,6 +210,10 @@ class TransformerConfig(ModelParallelConfig):
390+ @@ -210,6 +210,9 @@ class TransformerConfig(ModelParallelConfig):
391391 attention_output_gate: bool = False
392392 """Whether to apply output gate to the attention layers."""
393393
394394+ post_self_attn_layernorm: bool = False
395395+ post_mlp_layernorm: bool = False
396- + use_gated_attention: bool = False
397396+
398397 test_mode: bool = False
399398 """Whether to run real-time tests."""
@@ -469,21 +468,20 @@ index 3ea405770..5a42001b9 100644
469468 # discard the output of the pre-mlp layernorm and register the recompute
470469 # as a gradient hook of mlp_output_with_bias[0]
471470diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
472- index b267c8a81..def4ce809 100644
471+ index b267c8a81..83736acdc 100644
473472--- a/megatron/training/arguments.py
474473+++ b/megatron/training/arguments.py
475- @@ -1398,6 +1398,10 @@ def core_transformer_config_from_args(args, config_class=None):
474+ @@ -1398,6 +1398,9 @@ def core_transformer_config_from_args(args, config_class=None):
476475
477476 kw_args['inference_sampling_seed'] = args.seed
478477
479478+ kw_args['post_self_attn_layernorm'] = args.post_self_attn_layernorm
480479+ kw_args['post_mlp_layernorm'] = args.post_mlp_layernorm
481- + kw_args['use_gated_attention'] = args.use_gated_attention
482480+
483481 # handle quantization config
484482 # NOTE: Kitchen arguments are only added to the namespace when
485483 # Kitchen library is available.
486- @@ -1764,6 +1768 ,12 @@ def _add_network_size_args(parser):
484+ @@ -1764,6 +1767 ,12 @@ def _add_network_size_args(parser):
487485 action='store_true',
488486 help='If set, use original BERT residula connection '
489487 'ordering.')
0 commit comments