From cc6dd1d302a655dba08b1fe0162eaf3a4c926ab6 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 21 Jun 2022 15:48:00 +0200 Subject: [PATCH 01/25] WIP --- megatron/model/__init__.py | 1 + megatron/model/gpt_model.py | 6 +- megatron/model/module.py | 4 +- megatron/model/shared_t5_model.py | 189 ++++++++++++++++++ megatron/text_generation_utils.py | 4 +- megatron/utils.py | 11 +- pretrain_gpt.py | 6 +- pretrain_prefix_lm.py | 7 +- pretrain_shared_t5.py | 177 ++++++++++++++++ .../test_sampling.py | 6 +- tasks/zeroshot_gpt/evaluate.py | 4 +- tests/test_tensor_parallel.py | 4 +- 12 files changed, 395 insertions(+), 24 deletions(-) create mode 100644 megatron/model/shared_t5_model.py create mode 100644 pretrain_shared_t5.py diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index baf54e455..0662d93bb 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -18,6 +18,7 @@ from .distributed import DistributedDataParallel from .bert_model import BertModel from .gpt_model import GPTModel, GPTModelPipe +from .shared_t5_model import SharedT5ModelPipe from .t5_model import T5Model from .language_model import get_language_model from .module import Float16Module diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 31d33a91b..880fea7d1 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -21,7 +21,7 @@ from megatron import get_args from megatron import mpu from megatron.enums import AttnMaskType -from .module import MegatronModule, fp32_to_float16 +from .module import MegatronModule, fp32_to_16bit from .language_model import parallel_lm_logits from .language_model import get_language_model @@ -213,9 +213,9 @@ def __init__( def _to_float16(inputs): if args.fp16: - return fp32_to_float16(inputs, lambda v: v.half()) + return fp32_to_16bit(inputs, lambda v: v.half()) elif args.bf16: - return fp32_to_float16(inputs, lambda v: v.bfloat16()) + return fp32_to_16bit(inputs, lambda v: v.bfloat16()) else: return inputs diff --git a/megatron/model/module.py b/megatron/model/module.py index df92d95a9..3841dd140 100644 --- a/megatron/model/module.py +++ b/megatron/model/module.py @@ -122,7 +122,7 @@ def conversion_helper(val, conversion): return rtn -def fp32_to_float16(val, float16_convertor): +def fp32_to_16bit(val, float16_convertor): """Convert fp32 `val` to fp16/bf16""" def half_conversion(val): val_typecheck = val @@ -168,7 +168,7 @@ def float16_convertor(val): def forward(self, *inputs, **kwargs): if mpu.is_pipeline_first_stage(): - inputs = fp32_to_float16(inputs, self.float16_convertor) + inputs = fp32_to_16bit(inputs, self.float16_convertor) outputs = self.module(*inputs, **kwargs) if mpu.is_pipeline_last_stage(): outputs = float16_to_fp32(outputs) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py new file mode 100644 index 000000000..b375c6742 --- /dev/null +++ b/megatron/model/shared_t5_model.py @@ -0,0 +1,189 @@ +import torch +from deepspeed import PipelineModule +from deepspeed.runtime.pipe import TiedLayerSpec, LayerSpec +from torch.nn import LayerNorm + +from megatron.enums import AttnMaskType, LayerType + +from megatron.model.transformer import ParallelTransformerLayerPipe + +from megatron.model.language_model import EmbeddingPipe, parallel_lm_logits + +from megatron.model.utils import init_method_normal, scaled_init_method_normal + +from megatron import get_args, mpu + +from megatron.model.module import MegatronModule, fp32_to_16bit, 
float16_to_fp32 + +def cross_entropy(output, labels): + labels, loss_mask = labels[0], labels[1] + + losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), labels) + + expected_number_of_tokens = loss_mask.sum() + + loss_mask = loss_mask.view(-1) + loss = torch.sum(losses.view(-1) * loss_mask) / expected_number_of_tokens + return loss + +class SharedT5ModelPipe(PipelineModule, MegatronModule): + """Share encoder decoder language model.""" + + def __init__( + self, + num_tokentypes=0, + parallel_output=True, + ): + args = get_args() + self.parallel_output = parallel_output + + init_method = init_method_normal(args.init_method_std) + + self.specs = [] + + def _to_16bit(inputs): + if args.fp16: + return fp32_to_16bit(inputs, lambda v: v.half()) + elif args.bf16: + return fp32_to_16bit(inputs, lambda v: v.bfloat16()) + else: + return inputs + + self.specs.append(lambda inputss: tuple(_to_16bit(inputs) for inputs in inputss)) + + # Embedding layer + self.specs.append(TiedLayerSpec('embed', + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.hidden_dropout, + init_method=init_method, + num_tokentypes=num_tokentypes, + tied_weight_attr='word_embeddings_weight')) + + assert hasattr(args, 'attn_mask'), "Deepspeed integration should have attention mask s" + if args.fp32_residual_connection: + self.specs.append(lambda x: x.transpose(0, 1).contiguous().float()) + else: + self.specs.append(lambda x: x.transpose(0, 1).contiguous()) + + ### ----- Encoder ----- + for layer_idx in range(args.num_layers): + self.specs.append( + TiedLayerSpec( + f"block_{layer_idx}", + ParallelTransformerLayerPipe, + init_method=init_method, + # Inputs: (input_tokens, target_tokens, + forward_fn=lambda module, *inputs: , + output_layer_init_method=scaled_init_method_normal(args.init_method_std, + args.num_layers), + layer_type=LayerType.encoder, + layer_number=layer_idx, + self_attn_mask_type=AttnMaskType.causal, + tied_weight_attrs=["input_layernorm", "self_attention", "post_attention_layernorm", "mlp"] + )) + + # Final layernorm after encoder layers + self.specs.append( + TiedLayerSpec( + "final_layer_norm", + LayerNorm, + args.hidden_size, + eps=args.layernorm_epsilon + )) + + # Decoder + for layer_idx in range(args.num_layers): + self.specs.append( + TiedLayerSpec( + f"block_{layer_idx}", + ParallelTransformerLayerPipe, + init_method=init_method, + output_layer_init_method=scaled_init_method_normal(args.init_method_std, + args.num_layers), + layer_number=layer_idx, + layer_type=LayerType.decoder, + self_attn_mask_type=AttnMaskType.padding, + tied_weight_attrs=["input_layernorm", "self_attention", "post_attention_layernorm", "mlp"] + ) + ) + + # Final layernorm after decoder layers + self.specs.append( + TiedLayerSpec( + "final_layer_norm", + LayerNorm, + args.hidden_size, + eps=args.layernorm_epsilon + )) + + # Undo data format change + self.specs.append(lambda x: x.transpose(0, 1).contiguous()) + + def _logits_helper(embedding, lm_output): + """A wrapper to massage inputs/outputs from pipeline. 
""" + return parallel_lm_logits( + lm_output, + embedding.word_embeddings_weight, + self.parallel_output) + + self.specs.append( + TiedLayerSpec('embed', + EmbeddingPipe, + args.hidden_size, + args.padded_vocab_size, + args.hidden_dropout, + init_method=init_method, + num_tokentypes=num_tokentypes, + forward_fn=_logits_helper, + tied_weight_attr='word_embeddings_weight') + ) + + if not hasattr(args, 'attn_mask'): + # We drop attention mask from the pipeline + self.specs.append(lambda x: x[0]) + + # Final layernorm after transformer layers + self.specs.append( + TiedLayerSpec( + "final_layer_norm", + LayerNorm, + args.hidden_size, + eps=args.layernorm_epsilon + )) + + # Undo data format change + self.specs.append(lambda x: x.transpose(0, 1).contiguous()) + + # Convert to fp32 if needed + if args.fp16 or args.bf16: + self.specs.append(float16_to_fp32) + + if args.checkpoint_activations: + interval = args.checkpoint_num_layers + else: + interval = 0 + + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + topo = PipeModelDataParallelTopology(num_pp=mpu.get_pipeline_model_parallel_world_size(), + num_mp=mpu.get_tensor_model_parallel_world_size(), + num_dp=mpu.get_data_parallel_world_size()) + + # here one can extend the regex to include more layers to be counted towards partitioning, + # e.g. 'type:transformer|embedding' will add up all the transformer blocks and also the first + # and last embedding layers and then partition that transformers+2 layers - so to get a good + # balance you may want to use less transformer layers + # + # caveat emptor: the current implementation of PP fails unless each stage has at least one + # transformer layer + if args.pp_partition_method is not None: + partition_method = args.pp_partition_method + else: + partition_method = 'type:transformer' + + super().__init__(layers=self.specs, + loss_fn=cross_entropy, + topology=topo, + activation_checkpoint_interval=interval, + partition_method=partition_method) diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7a98b5d35..bd0ec59d8 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -26,7 +26,7 @@ from megatron import get_args from megatron import get_tokenizer from megatron import mpu -from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.utils import get_attention_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward # These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? @@ -42,7 +42,7 @@ def get_batch(context_tokens): # Move to GPU. tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda() # Get the attention mask and position ids. - attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + attention_mask, _, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/megatron/utils.py b/megatron/utils.py index 98d2f611c..29a4e9795 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -151,7 +151,8 @@ def check_adlr_autoresume_termination(iteration, model, sys.exit(0) -def get_ltor_masks_and_position_ids( + +def get_attention_masks_and_position_ids( data, eod_token, reset_position_ids, @@ -159,6 +160,7 @@ def get_ltor_masks_and_position_ids( eod_mask_loss, prefix_indices, loss_on_targets_only, + ltor=True, ): """ Build masks and position id for left to right model. 
@@ -177,9 +179,10 @@ def get_ltor_masks_and_position_ids( att_mask_batch = micro_batch_size else: att_mask_batch = 1 - attention_mask = torch.tril(torch.ones( - (att_mask_batch, seq_length, seq_length), device=data.device)).view( - att_mask_batch, 1, seq_length, seq_length) + attention_mask = torch.ones((att_mask_batch, seq_length, seq_length), device=data.device) + if ltor: + attention_mask = torch.tril(attention_mask) + attention_mask = attention_mask.view(att_mask_batch, 1, seq_length, seq_length) # Loss mask. loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 04f1b3b57..c1e9a11f0 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -25,7 +25,7 @@ from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices +from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices from megatron.utils import average_losses_across_data_parallel_group import deepspeed @@ -110,7 +110,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, @@ -141,7 +141,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Get the masks and position ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py index 391186e75..0e9ed019a 100644 --- a/pretrain_prefix_lm.py +++ b/pretrain_prefix_lm.py @@ -25,7 +25,7 @@ from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ from megatron.utils import average_losses_across_data_parallel_group import deepspeed @@ -97,7 +97,7 @@ def get_batch(data_iterator): ) # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, @@ -131,6 +131,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix + # TODO @thomasw21 actually since this step is random, we need to make sure that random state are synchronized. Otherwise we need to broadcast after this step. prefix_indices = get_prefix_indices( tokens, tokenizer.eod, @@ -139,7 +140,7 @@ def get_batch_pipe(data): ) # Get the masks and position ids. 
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/pretrain_shared_t5.py b/pretrain_shared_t5.py new file mode 100644 index 000000000..592f808d7 --- /dev/null +++ b/pretrain_shared_t5.py @@ -0,0 +1,177 @@ +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import SharedT5ModelPipe +from megatron.training import pretrain +from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import os + +try: + from torch.distributed.elastic.multiprocessing.errors import record +except ImportError: + # noop + def record(fn): + return fn + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + # TODO @thomasw21: fix this for PP > 1 (the issue is that you're passing two values that require grad) + assert mpu.get_pipeline_model_parallel_world_size() != 1, "PP > 1 is not supported yet" + + # TODO: actually I'm fairly confident that you don't need the causal mask here as it's handled with `AttnMaskType` + # # Precompute the attention mask and store it in args. This avoids having to + # # pipeline it as an activation during training. The mask is constant, and thus + # # we can reuse it. + # attention_mask = torch.tril(torch.ones( + # (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( + # 1, 1, args.seq_length, args.seq_length) + # + # # Convert attention mask to binary: + # attention_mask = (attention_mask < 0.5) + # if args.fp16: + # attention_mask = attention_mask.half() + # elif args.bf16: + # attention_mask = attention_mask.bfloat16() + # + # # must be bool or the training crashes expecting bool, but getting Half + # args.attn_mask = attention_mask.to(torch.bool) + + model = SharedT5ModelPipe( + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + else: + assert False, "Require deepspeed to be running" + see_memory_usage(f"After Building Model", force=True) + return model + + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + raise NotImplementedError("Waiting for MLM data loader to work") + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + # TODO @thomasw21 + keys = ["input_tokens", "target_tokens"] + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. 
+ input_tokens = data_b["input_tokens"].long() + target_tokens = data_b["target_tokens"].long()[:,:-1].contiguous() + label_tokens = data_b["target_tokens"].long()[:,1:].contiguous() + + # Get the masks and position ids. + input_attention_mask, _, input_position_ids = get_attention_masks_and_position_ids( + input_tokens, + tokenizer.eod, + reset_position_ids=False, # TODO @thomasw21 doesn't work out of the box + reset_attention_mask=False, # TODO @thomasw21 doesn't work out of the box + eod_mask_loss=False, # TODO @thomasw21 doesn't work out of the box + prefix_indices=None, + loss_on_targets_only=False, + ltor=False + ) + target_attention_mask, target_loss_mask, target_position_ids = get_attention_masks_and_position_ids( + target_tokens, + tokenizer.eod, + reset_position_ids=False, # TODO @thomasw21 doesn't work out of the box + reset_attention_mask=False, # TODO @thomasw21 doesn't work out of the box + eod_mask_loss=False, # TODO @thomasw21 doesn't work out of the box + prefix_indices=None, + loss_on_targets_only=args.loss_on_targets_only, + ltor=True + ) + + return ((input_tokens, input_attention_mask, input_position_ids), (target_tokens, target_attention_mask, target_position_ids)), (label_tokens, target_loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + raise NotImplementedError("Waiting for MLM data loader") + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating MLM datasets ...") + return train_ds, valid_ds, test_ds + +@record +def main(): + pretrain( + train_valid_test_datasets_provider, + model_provider, + # TODO @thomasw21: make it work without DS. 
+ forward_step_func=None, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + +if __name__ == "__main__": + main() diff --git a/scripts/test_multiple_dataset_sampling/test_sampling.py b/scripts/test_multiple_dataset_sampling/test_sampling.py index 2d5326c8c..8bed75c2a 100644 --- a/scripts/test_multiple_dataset_sampling/test_sampling.py +++ b/scripts/test_multiple_dataset_sampling/test_sampling.py @@ -25,7 +25,7 @@ from megatron.data.gpt_dataset import build_train_valid_test_datasets from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import get_attention_masks_and_position_ids from megatron.utils import average_losses_across_data_parallel_group import deepspeed @@ -117,7 +117,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, @@ -144,7 +144,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py index 090533c24..b17c76848 100644 --- a/tasks/zeroshot_gpt/evaluate.py +++ b/tasks/zeroshot_gpt/evaluate.py @@ -26,7 +26,7 @@ from megatron.checkpointing import load_checkpoint from megatron.model.gpt_model import GPTModel from megatron.training import get_model -from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.utils import get_attention_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward from tasks.finetune_utils import build_data_loader @@ -72,7 +72,7 @@ def process_batch(batch): tokens = tokens_[:, :-1].contiguous() # Get the masks and position ids. 
- attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + attention_mask, _, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 25921c12a..a0d257404 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -18,7 +18,7 @@ from multiprocessing import Pool from megatron.checkpointing import save_checkpoint -from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import get_attention_masks_and_position_ids @require_deepspeed @require_torch_multi_gpu @@ -98,7 +98,7 @@ def infer_model(args): def create_model_inputs(tokens): args = get_args() - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, From fbc1d04068e4d12f3f22d2a8108e436eabb074c1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 21 Jun 2022 17:40:47 +0200 Subject: [PATCH 02/25] Fix shared T5 --- megatron/model/shared_t5_model.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index b375c6742..908568f83 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -49,7 +49,7 @@ def _to_16bit(inputs): else: return inputs - self.specs.append(lambda inputss: tuple(_to_16bit(inputs) for inputs in inputss)) + self.specs.append(lambda inputss: tuple(tuple(_to_16bit(inputs)) for inputs in inputss)) # Embedding layer self.specs.append(TiedLayerSpec('embed', @@ -57,15 +57,18 @@ def _to_16bit(inputs): args.hidden_size, args.padded_vocab_size, args.hidden_dropout, + forward_fn=lambda module, inputs, targets: (module(*inputs), module(*targets)), init_method=init_method, num_tokentypes=num_tokentypes, tied_weight_attr='word_embeddings_weight')) assert hasattr(args, 'attn_mask'), "Deepspeed integration should have attention mask s" + # Drop everything beside tokens + self.specs.append(lambda inputs, targets: (inputs[0], targets[0])) if args.fp32_residual_connection: - self.specs.append(lambda x: x.transpose(0, 1).contiguous().float()) + self.specs.append(lambda input_tokens, target_tokens: (input_tokens.transpose(0, 1).contiguous().float(), target_tokens.transpose(0, 1).contiguous().float())) else: - self.specs.append(lambda x: x.transpose(0, 1).contiguous()) + self.specs.append(lambda input_tokens, target_tokens: (input_tokens.transpose(0, 1).contiguous(), target_tokens.transpose(0, 1).contiguous())) ### ----- Encoder ----- for layer_idx in range(args.num_layers): @@ -74,22 +77,21 @@ def _to_16bit(inputs): f"block_{layer_idx}", ParallelTransformerLayerPipe, init_method=init_method, - # Inputs: (input_tokens, target_tokens, - forward_fn=lambda module, *inputs: , + forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), output_layer_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers), layer_type=LayerType.encoder, layer_number=layer_idx, self_attn_mask_type=AttnMaskType.causal, - tied_weight_attrs=["input_layernorm", "self_attention", "post_attention_layernorm", "mlp"] + tied_weight_attrs=["self_attention", "mlp"] )) # Final layernorm after encoder layers self.specs.append( - TiedLayerSpec( - "final_layer_norm", + LayerSpec( LayerNorm, args.hidden_size, + 
forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), eps=args.layernorm_epsilon )) @@ -100,19 +102,22 @@ def _to_16bit(inputs): f"block_{layer_idx}", ParallelTransformerLayerPipe, init_method=init_method, + forward_fn=lambda module, encoded_tokens, target_tokens: (encoded_tokens, module(target_tokens, encoder_output=encoded_tokens)), output_layer_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers), layer_number=layer_idx, layer_type=LayerType.decoder, self_attn_mask_type=AttnMaskType.padding, - tied_weight_attrs=["input_layernorm", "self_attention", "post_attention_layernorm", "mlp"] + tied_weight_attrs=["self_attention", "mlp"] ) ) + # Drop encoded tokens + self.specs.append(lambda encoded_tokens, target_tokens: target_tokens) + # Final layernorm after decoder layers self.specs.append( - TiedLayerSpec( - "final_layer_norm", + LayerSpec( LayerNorm, args.hidden_size, eps=args.layernorm_epsilon From 91a20aa6195d186370ed15f000f117ede6ececa8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 17:42:14 +0200 Subject: [PATCH 03/25] WIP --- pretrain_shared_t5.py | 57 +++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/pretrain_shared_t5.py b/pretrain_shared_t5.py index 592f808d7..ee2d72a06 100644 --- a/pretrain_shared_t5.py +++ b/pretrain_shared_t5.py @@ -1,19 +1,15 @@ import torch -from functools import partial from megatron import get_args from megatron import print_rank_0 -from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.mlm_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import SharedT5ModelPipe from megatron.training import pretrain -from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices -from megatron.utils import average_losses_across_data_parallel_group +from megatron.utils import get_attention_masks_and_position_ids import deepspeed from deepspeed.runtime.utils import see_memory_usage -import os try: from torch.distributed.elastic.multiprocessing.errors import record @@ -39,24 +35,6 @@ def model_provider(pre_process=True, post_process=True): # TODO @thomasw21: fix this for PP > 1 (the issue is that you're passing two values that require grad) assert mpu.get_pipeline_model_parallel_world_size() != 1, "PP > 1 is not supported yet" - # TODO: actually I'm fairly confident that you don't need the causal mask here as it's handled with `AttnMaskType` - # # Precompute the attention mask and store it in args. This avoids having to - # # pipeline it as an activation during training. The mask is constant, and thus - # # we can reuse it. 
- # attention_mask = torch.tril(torch.ones( - # (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( - # 1, 1, args.seq_length, args.seq_length) - # - # # Convert attention mask to binary: - # attention_mask = (attention_mask < 0.5) - # if args.fp16: - # attention_mask = attention_mask.half() - # elif args.bf16: - # attention_mask = attention_mask.bfloat16() - # - # # must be bool or the training crashes expecting bool, but getting Half - # args.attn_mask = attention_mask.to(torch.bool) - model = SharedT5ModelPipe( num_tokentypes=0, parallel_output=True @@ -72,12 +50,10 @@ def model_provider(pre_process=True, post_process=True): def get_batch_pipe(data): """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - raise NotImplementedError("Waiting for MLM data loader to work") args = get_args() tokenizer = get_tokenizer() # Items and their type. - # TODO @thomasw21 keys = ["input_tokens", "target_tokens"] datatype = torch.int64 @@ -116,7 +92,7 @@ def get_batch_pipe(data): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" - raise NotImplementedError("Waiting for MLM data loader") + args = get_args() train_ds, valid_ds, test_ds = None, None, None @@ -129,9 +105,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, + sequence_length=args.seq_length, + noise_density=args.noise_density, + mean_noise_span_length=args.mean_noise_span_length, seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + skip_warmup=(not args.mmap_warmup) + ) # Option 2 of data loading using --(train|valid|test)-weighted-split-paths elif args.train_weighted_split_paths: assigned_train_valid_test = [] @@ -151,12 +130,20 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): eval(f"args.{s}_weighted_split_splits"), eval(f"args.{s}_weighted_split_names")) for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) + d = build_dataset_group( + dataset_group_name=name, + paths=paths, + weights=weights, + splits=splits, + data_impl=args.data_impl, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + noise_density=args.noise_density, + mean_noise_span_length=args.mean_noise_span_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_valid_test=s + ) eval(f"{s}_ds").append(d) else: raise NotImplementedError("No dataloading argument passed") From e55b6031cdf209974b0aa74fc40e1a57ee8d7b01 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:13:38 +0200 Subject: [PATCH 04/25] Shared t5 tests --- megatron/arguments.py | 4 +- megatron/data/mlm_dataset.py | 10 +- ...ed_t5.py => pretrain_shared_t5_with_mlm.py | 4 +- tests/test_training.py | 126 ++++++++++++++++++ 4 files changed, 139 insertions(+), 5 deletions(-) rename pretrain_shared_t5.py => pretrain_shared_t5_with_mlm.py (97%) diff --git a/megatron/arguments.py b/megatron/arguments.py index 2211e17dc..869ebd5c1 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -925,8 +925,8 @@ def __call__(self, parser, args, values, option_string=None): 'specific positions. 
This option tries to un-bias the loss by reweighting loss on specific ' 'positions based on how frequently we train on that position.' 'This is mostly used for prefix_lm training') - group.add_argument("--noise_density", type=float, default=None, help="Span corruption noise density") - group.add_argument("--mean_noise_span_length", type=int, default=None, help="Span corruption mean noise span length") + group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density") + group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length") return parser diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index 024b4c345..ca791788f 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -3,7 +3,7 @@ import numpy as np import torch -from megatron import print_rank_0, get_tokenizer +from megatron import print_rank_0, get_tokenizer, get_args from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_split_by_range_ from megatron.data.dataset_utils import get_train_valid_test_split_, get_indexed_dataset_ @@ -325,6 +325,14 @@ def __init__( assert len(self.sentinel_token_ids) > 0, "Provide the argument --vocab-extra-ids 100 to the script" assert len(self.sentinel_token_ids) >= self.num_noise_spans, "Not enough sentinel tokens, please add more" + args = get_args() + if hasattr(args, "encoder_seq_length") and args.encoder_seq_length is not None: + # T5 style + assert self.inputs_length == args.encoder_seq_length + assert self.targets_length == args.decoder_seq_length + else: + assert self.inputs_length + self.targets_length == args.seq_length + def __len__(self): return len(self.samples_mapping) diff --git a/pretrain_shared_t5.py b/pretrain_shared_t5_with_mlm.py similarity index 97% rename from pretrain_shared_t5.py rename to pretrain_shared_t5_with_mlm.py index ee2d72a06..08b91b1ea 100644 --- a/pretrain_shared_t5.py +++ b/pretrain_shared_t5_with_mlm.py @@ -105,7 +105,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - sequence_length=args.seq_length, + sequence_length=args.encoder_seq_length + args.decoder_seq_length, noise_density=args.noise_density, mean_noise_span_length=args.mean_noise_span_length, seed=args.seed, @@ -137,7 +137,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): splits=splits, data_impl=args.data_impl, train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, + seq_length=args.encoder_seq_length + args.decoder_seq_length, noise_density=args.noise_density, mean_noise_span_length=args.mean_noise_span_length, seed=args.seed, diff --git a/tests/test_training.py b/tests/test_training.py index c77cb9af2..e73897302 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -592,3 +592,129 @@ def test_skip_train_iteration(self): train_iterations = range(1,10) for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) + + def test_pretrain_shared_t5_mlm(self): + # all in one test + src_dir = self.src_dir + data_dir = f"{self.data_dir}/gpt2" + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) + logs_dir = f"{output_dir}/logs" + Path(logs_dir).mkdir(parents=True, exist_ok=True) + + pp_size, tp_size, dp_size = get_3d_dimensions() + num_gpus = pp_size * 
tp_size * dp_size + + # TODO @thomasw21 fix once t5 supports pipeline parallelism + dp_size *= pp_size + pp_size = 1 + + n_samples = 200 # about 37 iterations + exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume + noise_density=0.15 + mean_noise_span_length=3 + encoder_seq_length = 512 + decoder_seq_length = 114 # imposed by `noise_density=0.15` and `input_sequence_length = 512` + + + args = f""" + --tensor-model-parallel-size {tp_size} + --pipeline-model-parallel-size {pp_size} + --distributed-backend nccl + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --decoder-seq-length {decoder_seq_length} + --encoder-seq-length {encoder_seq_length} + --max-position-embeddings 1024 + --micro-batch-size 1 + --rampup-batch-size 2 2 {n_samples} + --global-batch-size 16 + --train-samples {n_samples} + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --clip-grad 1.0 + --weight-decay 1e-1 + --fp16 + + --log-interval 5 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --exit-interval {exit_interval} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --log-path {logs_dir} + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --noise-density {noise_density} + --mean-noise-span-length {mean_noise_span_length} + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --log-level debug + """.split() + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config.json + --zero-stage 1 + --deepspeed-activation-checkpointing + """.split() + + script = [f"{src_dir}/pretrain_shated_t5_with_mlm.py"] + launcher = get_launcher(num_gpus) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + # 1. test training from scratch (no checkpoint) + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test deepspeed is running + self.assertIn("DeepSpeed info", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test there should be no checkpoint this round + self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + # test tensorboard + tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*") + self.assertEqual(len(tensorboard_files), 1, "tensorboard files") + + # 2. 
test training from checkpoint: resume + # now do it again, this time resuming from the checkpoint + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + # test tensorboard (1 file from the first run, plus 1 now) + tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*") + self.assertEqual(len(tensorboard_files), 2, "tensorboard files") + From b5568fba73121d1f7dba66340715766086eafdd5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:29:09 +0200 Subject: [PATCH 05/25] Woops --- tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index e73897302..6f5ba1f70 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -673,7 +673,7 @@ def test_pretrain_shared_t5_mlm(self): --deepspeed-activation-checkpointing """.split() - script = [f"{src_dir}/pretrain_shated_t5_with_mlm.py"] + script = [f"{src_dir}/pretrain_shared_t5_with_mlm.py"] launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args From 50cf6d9c4db2471f4497b19e7e63ec8cc6e56e1a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:43:58 +0200 Subject: [PATCH 06/25] Woops --- pretrain_shared_t5_with_mlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_shared_t5_with_mlm.py b/pretrain_shared_t5_with_mlm.py index 08b91b1ea..28e8a9186 100644 --- a/pretrain_shared_t5_with_mlm.py +++ b/pretrain_shared_t5_with_mlm.py @@ -33,7 +33,7 @@ def model_provider(pre_process=True, post_process=True): mpu=mpu): if args.deepspeed: # TODO @thomasw21: fix this for PP > 1 (the issue is that you're passing two values that require grad) - assert mpu.get_pipeline_model_parallel_world_size() != 1, "PP > 1 is not supported yet" + assert mpu.get_pipeline_model_parallel_world_size() == 1, "PP > 1 is not supported yet" model = SharedT5ModelPipe( num_tokentypes=0, From dcb2d6101cf617985d8f426e6f09ea36aef09ae1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:50:23 +0200 Subject: [PATCH 07/25] Hack my way into fix attn_mask --- pretrain_shared_t5_with_mlm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pretrain_shared_t5_with_mlm.py b/pretrain_shared_t5_with_mlm.py index 28e8a9186..09ee1e3ab 100644 --- a/pretrain_shared_t5_with_mlm.py +++ b/pretrain_shared_t5_with_mlm.py @@ -35,6 +35,9 @@ def model_provider(pre_process=True, post_process=True): # TODO @thomasw21: fix this for PP > 1 (the issue is that you're passing two values that require grad) assert mpu.get_pipeline_model_parallel_world_size() == 1, "PP > 1 is not supported yet" + # TODO @thomasw21 hack to bypass a specific check + args.attn_mask = None + model = SharedT5ModelPipe( num_tokentypes=0, parallel_output=True From 04cdc2e027040f9927dcacd8cd7ab647e50cb386 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:52:45 +0200 Subject: [PATCH 08/25] DS has poor default --- megatron/model/shared_t5_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/model/shared_t5_model.py 
b/megatron/model/shared_t5_model.py index 908568f83..38b38cd6c 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -83,6 +83,7 @@ def _to_16bit(inputs): layer_type=LayerType.encoder, layer_number=layer_idx, self_attn_mask_type=AttnMaskType.causal, + tied_weight_attr=None, tied_weight_attrs=["self_attention", "mlp"] )) @@ -108,6 +109,7 @@ def _to_16bit(inputs): layer_number=layer_idx, layer_type=LayerType.decoder, self_attn_mask_type=AttnMaskType.padding, + tied_weight_attr=None, tied_weight_attrs=["self_attention", "mlp"] ) ) From ffbe3bf5b00fcd304897b99d75a89f87245ed22c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:06:49 +0200 Subject: [PATCH 09/25] Maybe this is better --- megatron/model/shared_t5_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index 38b38cd6c..98e373a4b 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -91,8 +91,8 @@ def _to_16bit(inputs): self.specs.append( LayerSpec( LayerNorm, - args.hidden_size, forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), + hidden_size=args.hidden_size, eps=args.layernorm_epsilon )) @@ -121,7 +121,7 @@ def _to_16bit(inputs): self.specs.append( LayerSpec( LayerNorm, - args.hidden_size, + hidden_size=args.hidden_size, eps=args.layernorm_epsilon )) From 90ee34c6af56bed886efe096a33f4fb1fbb069a8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:23:17 +0200 Subject: [PATCH 10/25] Maybe this is better --- megatron/model/shared_t5_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index 98e373a4b..38b38cd6c 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -91,8 +91,8 @@ def _to_16bit(inputs): self.specs.append( LayerSpec( LayerNorm, + args.hidden_size, forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), - hidden_size=args.hidden_size, eps=args.layernorm_epsilon )) @@ -121,7 +121,7 @@ def _to_16bit(inputs): self.specs.append( LayerSpec( LayerNorm, - hidden_size=args.hidden_size, + args.hidden_size, eps=args.layernorm_epsilon )) From ed3dbeacb0232d6169beb974abdecb0948c2cb74 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:25:09 +0200 Subject: [PATCH 11/25] SEP is only defined for HFTokenizer --- tests/test_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 6f5ba1f70..e52981e70 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -649,8 +649,8 @@ def test_pretrain_shared_t5_mlm(self): --checkpoint-activations --exit-interval {exit_interval} - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json + --tokenizer-type PretrainedFromHF + --tokenizer-name-or-path gpt2 --log-path {logs_dir} --save {output_dir}/checkpoints --load {output_dir}/checkpoints From c93b563941454d05022ac4ea21ac199a4864af75 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:26:02 +0200 Subject: [PATCH 12/25] add --vocab-extra-ids 100 --- tests/test_training.py | 1 + 1 file changed, 1 insertion(+) diff 
--git a/tests/test_training.py b/tests/test_training.py index e52981e70..539f0044a 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -651,6 +651,7 @@ def test_pretrain_shared_t5_mlm(self): --tokenizer-type PretrainedFromHF --tokenizer-name-or-path gpt2 + --vocab-extra-ids 100 --log-path {logs_dir} --save {output_dir}/checkpoints --load {output_dir}/checkpoints From ef205ddff6149d2a7e75578a195c33d88bbc216b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:42:06 +0200 Subject: [PATCH 13/25] Hopefully this fixes MLM --- megatron/data/mlm_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index ca791788f..6f962a684 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -297,13 +297,14 @@ def __init__( # according to `noise_density` and `mean_noise_span_length`. We can also define the label length accordingly. number_of_raw_tokens, inputs_length, targets_length, num_noise_spans = compute_input_and_target_lengths( # +1 is used so that we can compute the as autoregressive systems require us to add one more token. - sequence_length=self.sequence_length + 1, + sequence_length=self.sequence_length, noise_density=self.noise_density, mean_noise_span_length=self.mean_noise_span_length ) - self.number_of_raw_tokens = number_of_raw_tokens self.inputs_length = inputs_length - self.targets_length = targets_length + # As the loss we add a token at the end + self.number_of_raw_tokens = number_of_raw_tokens + 1 + self.targets_length = targets_length +1 self.num_noise_spans = num_noise_spans # Build the samples mapping. From c71b3799767265373f1597cec6b08067de491a01 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:43:58 +0200 Subject: [PATCH 14/25] Hopefully this fixes MLM --- megatron/data/mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index 6f962a684..6d0289cf4 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -330,7 +330,7 @@ def __init__( if hasattr(args, "encoder_seq_length") and args.encoder_seq_length is not None: # T5 style assert self.inputs_length == args.encoder_seq_length - assert self.targets_length == args.decoder_seq_length + assert self.targets_length == args.decoder_seq_length + 1 else: assert self.inputs_length + self.targets_length == args.seq_length From 9da75889ac0c570f2d4bd6dc7d7f66c258ebecc1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:46:11 +0200 Subject: [PATCH 15/25] Fix MLM datasets length --- megatron/data/mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index 6d0289cf4..63b8f5358 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -335,7 +335,7 @@ def __init__( assert self.inputs_length + self.targets_length == args.seq_length def __len__(self): - return len(self.samples_mapping) + return len(self._gpt_dataset) def __getitem__(self, idx): if isinstance(idx, slice): From 07f0ce5793e8faf0a99e71c8482e9cf50a665c2f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 20:51:38 +0200 Subject: [PATCH 16/25] I think GPT2 tokenizer might be missing token --- 
megatron/data/mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index 63b8f5358..d8f4450b4 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -323,6 +323,7 @@ def __init__( tokenizer = get_tokenizer() self.sep_id = tokenizer.sep self.sentinel_token_ids = tokenizer.additional_special_tokens_ids + assert self.sep_id is not None, "MLM dataset requires tokenizer to have a token" assert len(self.sentinel_token_ids) > 0, "Provide the argument --vocab-extra-ids 100 to the script" assert len(self.sentinel_token_ids) >= self.num_noise_spans, "Not enough sentinel tokens, please add more" From 12626f619d5a2e6bd834528649a13f3f2cb2e506 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 22:01:44 +0200 Subject: [PATCH 17/25] I think GPT2 tokenizer might be missing token --- megatron/data/mlm_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py index d8f4450b4..87028d31e 100644 --- a/megatron/data/mlm_dataset.py +++ b/megatron/data/mlm_dataset.py @@ -321,7 +321,9 @@ def __init__( # Vocab stuff. tokenizer = get_tokenizer() - self.sep_id = tokenizer.sep + # TODO @thomasw21 find if overloading eod is acceptable. + # self.sep_id = tokenizer.sep + self.sep_id = tokenizer.eod self.sentinel_token_ids = tokenizer.additional_special_tokens_ids assert self.sep_id is not None, "MLM dataset requires tokenizer to have a token" assert len(self.sentinel_token_ids) > 0, "Provide the argument --vocab-extra-ids 100 to the script" From 52c42b251ed0f4a08dc892472073272ea7ede68d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 22:19:33 +0200 Subject: [PATCH 18/25] Flatten to pass tuples of tensor instead --- megatron/model/shared_t5_model.py | 2 +- pretrain_shared_t5_with_mlm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index 38b38cd6c..f8a1dc91d 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -57,7 +57,7 @@ def _to_16bit(inputs): args.hidden_size, args.padded_vocab_size, args.hidden_dropout, - forward_fn=lambda module, inputs, targets: (module(*inputs), module(*targets)), + forward_fn=lambda module, input_tokens, input_attention_mask, input_position_ids, target_tokens, target_attention_mask, target_position_ids: (module(input_tokens, input_attention_mask, input_position_ids), module(target_tokens, target_attention_mask, target_position_ids)), init_method=init_method, num_tokentypes=num_tokentypes, tied_weight_attr='word_embeddings_weight')) diff --git a/pretrain_shared_t5_with_mlm.py b/pretrain_shared_t5_with_mlm.py index 09ee1e3ab..7adf5ad34 100644 --- a/pretrain_shared_t5_with_mlm.py +++ b/pretrain_shared_t5_with_mlm.py @@ -90,7 +90,7 @@ def get_batch_pipe(data): ltor=True ) - return ((input_tokens, input_attention_mask, input_position_ids), (target_tokens, target_attention_mask, target_position_ids)), (label_tokens, target_loss_mask) + return (input_tokens, input_attention_mask, input_position_ids, target_tokens, target_attention_mask, target_position_ids), (label_tokens, target_loss_mask) def train_valid_test_datasets_provider(train_val_test_num_samples): From b25803bfec22131f753838fcdf10aa6a99f2f86a Mon Sep 17 00:00:00 2001 From: thomasw21 
<24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 22:44:18 +0200 Subject: [PATCH 19/25] Lambda function can't unpack tuples --- megatron/model/shared_t5_model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index f8a1dc91d..fc9755397 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -57,7 +57,7 @@ def _to_16bit(inputs): args.hidden_size, args.padded_vocab_size, args.hidden_dropout, - forward_fn=lambda module, input_tokens, input_attention_mask, input_position_ids, target_tokens, target_attention_mask, target_position_ids: (module(input_tokens, input_attention_mask, input_position_ids), module(target_tokens, target_attention_mask, target_position_ids)), + forward_fn=lambda module, input_and_target: (module(*(input_and_target[:3])), module(*(input_and_target[3:]))), init_method=init_method, num_tokentypes=num_tokentypes, tied_weight_attr='word_embeddings_weight')) @@ -77,7 +77,7 @@ def _to_16bit(inputs): f"block_{layer_idx}", ParallelTransformerLayerPipe, init_method=init_method, - forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), + forward_fn=lambda module, input_and_target: (module(input_and_target[0]), input_and_target[1]), output_layer_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers), layer_type=LayerType.encoder, @@ -92,7 +92,7 @@ def _to_16bit(inputs): LayerSpec( LayerNorm, args.hidden_size, - forward_fn=lambda module, input_tokens, target_tokens: (module(input_tokens), target_tokens), + forward_fn=lambda module, input_and_target: (module(input_and_target[0]), input_and_target[1]), eps=args.layernorm_epsilon )) @@ -103,7 +103,8 @@ def _to_16bit(inputs): f"block_{layer_idx}", ParallelTransformerLayerPipe, init_method=init_method, - forward_fn=lambda module, encoded_tokens, target_tokens: (encoded_tokens, module(target_tokens, encoder_output=encoded_tokens)), + forward_fn=lambda module, encoded_and_target: ( + encoded_and_target[0], module(encoded_and_target[1], encoder_output=encoded_and_target[0])), output_layer_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers), layer_number=layer_idx, From 26d9dce163b55487de34e548ac6b95e02ba96008 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 23:03:23 +0200 Subject: [PATCH 20/25] Woops --- megatron/model/shared_t5_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index fc9755397..4ad131fdd 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -49,7 +49,7 @@ def _to_16bit(inputs): else: return inputs - self.specs.append(lambda inputss: tuple(tuple(_to_16bit(inputs)) for inputs in inputss)) + self.specs.append(lambda inputss: tuple(_to_16bit(inputs) for inputs in inputss)) # Embedding layer self.specs.append(TiedLayerSpec('embed', @@ -57,7 +57,7 @@ def _to_16bit(inputs): args.hidden_size, args.padded_vocab_size, args.hidden_dropout, - forward_fn=lambda module, input_and_target: (module(*(input_and_target[:3])), module(*(input_and_target[3:]))), + forward_fn=lambda module, input_and_target: (module(input_and_target[:3]), module(input_and_target[3:])), init_method=init_method, num_tokentypes=num_tokentypes, tied_weight_attr='word_embeddings_weight')) From fc0560ac2420d4b4caed4d0006a1cadb34d5c354 Mon Sep 17 
00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 23:06:50 +0200 Subject: [PATCH 21/25] More dirty fixes --- pretrain_shared_t5_with_mlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_shared_t5_with_mlm.py b/pretrain_shared_t5_with_mlm.py index 7adf5ad34..c98ae8081 100644 --- a/pretrain_shared_t5_with_mlm.py +++ b/pretrain_shared_t5_with_mlm.py @@ -90,7 +90,7 @@ def get_batch_pipe(data): ltor=True ) - return (input_tokens, input_attention_mask, input_position_ids, target_tokens, target_attention_mask, target_position_ids), (label_tokens, target_loss_mask) + return (input_tokens, input_position_ids, input_attention_mask, target_tokens, target_position_ids, target_attention_mask), (label_tokens, target_loss_mask) def train_valid_test_datasets_provider(train_val_test_num_samples): From 8578ab36a1162bf6d7ddc947d8e5b683b8a73df4 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 23:17:50 +0200 Subject: [PATCH 22/25] Woops --- megatron/model/shared_t5_model.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py index 4ad131fdd..c52390988 100644 --- a/megatron/model/shared_t5_model.py +++ b/megatron/model/shared_t5_model.py @@ -64,11 +64,11 @@ def _to_16bit(inputs): assert hasattr(args, 'attn_mask'), "Deepspeed integration should have attention mask s" # Drop everything beside tokens - self.specs.append(lambda inputs, targets: (inputs[0], targets[0])) + # self.specs.append(lambda inputs, targets: (inputs[0], targets[0])) if args.fp32_residual_connection: - self.specs.append(lambda input_tokens, target_tokens: (input_tokens.transpose(0, 1).contiguous().float(), target_tokens.transpose(0, 1).contiguous().float())) + self.specs.append(lambda input_and_target: (input_and_target[0].transpose(0, 1).contiguous().float(), input_and_target[1].transpose(0, 1).contiguous().float())) else: - self.specs.append(lambda input_tokens, target_tokens: (input_tokens.transpose(0, 1).contiguous(), target_tokens.transpose(0, 1).contiguous())) + self.specs.append(lambda input_and_target: (input_and_target[0].transpose(0, 1).contiguous(), input_and_target[1].transpose(0, 1).contiguous())) ### ----- Encoder ----- for layer_idx in range(args.num_layers): @@ -103,8 +103,7 @@ def _to_16bit(inputs): f"block_{layer_idx}", ParallelTransformerLayerPipe, init_method=init_method, - forward_fn=lambda module, encoded_and_target: ( - encoded_and_target[0], module(encoded_and_target[1], encoder_output=encoded_and_target[0])), + forward_fn=lambda module, encoded_and_target: (encoded_and_target[0], module(encoded_and_target[1], encoder_output=encoded_and_target[0])), output_layer_init_method=scaled_init_method_normal(args.init_method_std, args.num_layers), layer_number=layer_idx, @@ -116,7 +115,7 @@ def _to_16bit(inputs): ) # Drop encoded tokens - self.specs.append(lambda encoded_tokens, target_tokens: target_tokens) + self.specs.append(lambda encoded_and_target: encoded_and_target[1]) # Final layernorm after decoder layers self.specs.append( From 216a3f5f5d961531f2ed4827d213194aab00dc0d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Mon, 27 Jun 2022 23:21:00 +0200 Subject: [PATCH 23/25] Removing erroneous trailing layers --- megatron/model/shared_t5_model.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/megatron/model/shared_t5_model.py 
From 216a3f5f5d961531f2ed4827d213194aab00dc0d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 27 Jun 2022 23:21:00 +0200
Subject: [PATCH 23/25] Removing erroneous trailing layers

---
 megatron/model/shared_t5_model.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/megatron/model/shared_t5_model.py b/megatron/model/shared_t5_model.py
index c52390988..d944cb746 100644
--- a/megatron/model/shared_t5_model.py
+++ b/megatron/model/shared_t5_model.py
@@ -147,22 +147,6 @@ def _logits_helper(embedding, lm_output):
                           tied_weight_attr='word_embeddings_weight')
         )
 
-        if not hasattr(args, 'attn_mask'):
-            # We drop attention mask from the pipeline
-            self.specs.append(lambda x: x[0])
-
-        # Final layernorm after transformer layers
-        self.specs.append(
-            TiedLayerSpec(
-                "final_layer_norm",
-                LayerNorm,
-                args.hidden_size,
-                eps=args.layernorm_epsilon
-            ))
-
-        # Undo data format change
-        self.specs.append(lambda x: x.transpose(0, 1).contiguous())
-
         # Convert to fp32 if needed
         if args.fp16 or args.bf16:
             self.specs.append(float16_to_fp32)

From f557523eea39a6ebedcdb19a1599c364a2bcf621 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 30 Jun 2022 15:40:39 +0200
Subject: [PATCH 24/25] Add TODO for loading previous checkpoint

---
 tests/test_training.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 539f0044a..1d4e7efc9 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -719,3 +719,10 @@ def test_pretrain_shared_t5_mlm(self):
 
         tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
         self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
+
+    def test_convert_from_gpt2_to_shared_t5_lm(self):
+        # Test structure:
+        # - run `pretrain_gpt.py` in order to obtain GPT checkpoint
+        # - run conversion script in order to obtain a shared t5 checkpoint
+        # - run `pretrain_shared_t5_with_mlm.py` from that checkpoint
+        raise NotImplementedError("TODO @thomasw21 write script in order to convert gpt2 checkpoint")
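
The TODO above describes a three-step round trip. A hypothetical outline of that test, using invented helper names (run_pretrain_gpt, run_gpt_to_shared_t5_conversion, run_pretrain_shared_t5_mlm) purely to make the intended sequence concrete; the real test would reuse the harness already present in tests/test_training.py:

    def test_convert_from_gpt2_to_shared_t5_lm_outline(self):
        # Step 1: pretrain a tiny GPT model to produce a checkpoint directory (hypothetical helper).
        gpt_checkpoint_dir = self.run_pretrain_gpt()
        # Step 2: convert it to the shared T5 layout via tools/convert_checkpoint (hypothetical helper).
        shared_t5_checkpoint_dir = self.run_gpt_to_shared_t5_conversion(gpt_checkpoint_dir)
        # Step 3: resume MLM pretraining from the converted checkpoint (hypothetical helper).
        self.run_pretrain_shared_t5_mlm(load_dir=shared_t5_checkpoint_dir)
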
From e73aa58e2c5aaa8b3fb11288e8c46b68edd38f46 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 4 Jul 2022 15:42:55 +0200
Subject: [PATCH 25/25] WIP: conversion script:

---
 .../convert_gpt_to_shared_t5.py | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 tools/convert_checkpoint/convert_gpt_to_shared_t5.py

diff --git a/tools/convert_checkpoint/convert_gpt_to_shared_t5.py b/tools/convert_checkpoint/convert_gpt_to_shared_t5.py
new file mode 100644
index 000000000..9030cf273
--- /dev/null
+++ b/tools/convert_checkpoint/convert_gpt_to_shared_t5.py
@@ -0,0 +1,60 @@
+import argparse
+import re
+from functools import partial
+from multiprocessing import Pool
+from pathlib import Path
+
+import torch
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gpt-checkpoint-path", type=Path, required=True)
+    parser.add_argument("--output-shared-t5-path", type=Path, required=True)
+    parser.add_argument("--only-weights", action="store_true")
+    parser.add_argument("--num-proc", type=int, default=1)
+    return parser.parse_args()
+
+def get_shared_t5_file(gpt_path_file: Path, output_shared_t5_path: Path) -> Path:
+    """Given a GPT checkpoint file path, get the equivalent shared T5 checkpoint path"""
+    raise NotImplementedError()
+
+def get_shared_t5_weight_name(gpt_weight_name: str) -> str:
+    """Given a GPT checkpoint weight name, get the equivalent shared T5 checkpoint weight name"""
+    raise NotImplementedError()
+
+def map_gpt_weights_to_shared_t5_weights(filename: Path, output_shared_t5_path: Path):
+    gpt_weights = torch.load(filename)
+
+    shared_t5_filename = get_shared_t5_file(filename, output_shared_t5_path=output_shared_t5_path)
+    shared_t5_weights = {}
+    for name, weight in gpt_weights.items():
+        shared_t5_weight_name = get_shared_t5_weight_name(name)
+        shared_t5_weights[shared_t5_weight_name] = weight
+
+    torch.save(shared_t5_weights, shared_t5_filename)
+
+IS_WEIGHT_REGEX = re.compile(r"layer_[\d]{2}-model_[\d]{2}-model_states.pt")
+def is_weight_file(filename: Path) -> bool:
+    if filename.is_dir():
+        return False
+
+    basename = filename.name
+    return IS_WEIGHT_REGEX.match(basename) is not None
+
+def main():
+    args = get_args()
+
+    weight_files = [filename for filename in args.gpt_checkpoint_path.iterdir() if is_weight_file(filename)]
+    if args.num_proc == 1:
+        for weight_file in weight_files:
+            map_gpt_weights_to_shared_t5_weights(weight_file, output_shared_t5_path=args.output_shared_t5_path)
+    else:
+        with Pool(args.num_proc) as pool:
+            pool.map(
+                partial(map_gpt_weights_to_shared_t5_weights, output_shared_t5_path=args.output_shared_t5_path),
+                weight_files
+            )
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
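
For reference, a short demo of the shard-name convention IS_WEIGHT_REGEX targets (the example filenames assume DeepSpeed's layer_XX-model_YY-model_states.pt pipeline-checkpoint naming), followed by the intended invocation once the two NotImplementedError stubs are filled in; the paths are placeholders:

import re

IS_WEIGHT_REGEX = re.compile(r"layer_[\d]{2}-model_[\d]{2}-model_states.pt")

for name in (
    "layer_03-model_00-model_states.pt",  # matches: a pipeline layer shard
    "layer_3-model_0-model_states.pt",    # no match: indices must be zero-padded to two digits
    "mp_rank_00_model_states.pt",         # no match: engine/optimizer state is skipped
):
    print(f"{name}: {IS_WEIGHT_REGEX.match(name) is not None}")

# Intended usage (flags as defined in get_args(); paths are placeholders):
#   python tools/convert_checkpoint/convert_gpt_to_shared_t5.py \
#       --gpt-checkpoint-path /path/to/gpt/global_step1000 \
#       --output-shared-t5-path /path/to/shared_t5/global_step1000 \
#       --num-proc 4
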