bigscience-workshop · Muennighoff · Jul 10, 2022 · Jul 11, 2022 · Jul 11, 2022 · Jul 11, 2022
diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py
@@ -89,7 +89,7 @@ def get_batch_pipe(data):
     attention_mask = get_packed_attention_mask(
         # Run non-causal decoder
         is_causal=False,
-        causal_mask=~(causal_mask.bool()),
+        causal_mask=~(causal_mask.bool()),  # Invert back so that the lower triangle (tril) is ones
         decoder_is_inputs=decoder_is_inputs.bool(),
         segment_ids=segment_ids.long(),
     )

diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
@@ -273,7 +273,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
 
     if args.deepspeed:
         load_optimizer_states = False if args.no_load_optim else True
-        loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states)
+        loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states)
         if loaded_dir is None:
             print_rank_0('WARNING: could not find the metadata file {} '.format(
                 load_dir))

diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py
@@ -252,7 +252,8 @@ def _to_float16(inputs):
                                                                        args.num_layers),
                     layer_number=layer_idx,
                     # TODO: Change naming of class from GPT to something that encapsulates prefix lm.
-                    self_attn_mask_type=attn_mask_type))
+                    self_attn_mask_type=attn_mask_type)
+                )
 
         # Undo data format change
         def undo(x):

diff --git a/megatron/utils.py b/megatron/utils.py
@@ -261,11 +261,18 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode
         - segment_ids: torch.IntTensor [batch_size, sequence_length]
     Returns:
         - attention_mask: torch.BoolTensor [batch_size, 1, sequence_length, sequence_length]
+
+    Example inputs used for the mask examples below:
+        att_mask_batch = 1
+        seq_length = 7
+        decoder_is_inputs = torch.tensor([[1, 1, 0, 1, 1, 0, 0]])
+        segment_ids = torch.tensor([[1, 1, 1, 2, 2, 2, 0]])
+        causal_mask = torch.tril(torch.ones(att_mask_batch, seq_length, seq_length)).view(att_mask_batch, 1, seq_length, seq_length)
     """
 
     """Causal Inputs Mask:
-    mask = [[[[1, 1, 0, 0, 0, 0, 0],
-            [1, 1, 0, 0, 0, 0, 0],
+    mask = [[[[1, 1, 0, 1, 1, 0, 0],
+            [1, 1, 0, 1, 1, 0, 0],
             [1, 1, 1, 0, 0, 0, 0],
             [1, 1, 1, 1, 1, 0, 0],
             [1, 1, 1, 1, 1, 0, 0],
@@ -299,7 +306,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode
             [0, 0, 0, 1, 1, 1, 0],
             [0, 0, 0, 1, 1, 1, 0],
             [0, 0, 0, 1, 1, 1, 0],
-            [0, 0, 0, 0, 0, 0, 0]]]]
+            [0, 0, 0, 0, 0, 0, 1]]]]
     """
     segment_mask = segment_ids[:, None, :, None] == segment_ids[:, None, None, :]
 
@@ -311,13 +318,22 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode
             [0, 0, 0, 1, 1, 0, 0],
             [0, 0, 0, 1, 1, 1, 0],
             [0, 0, 0, 0, 0, 0, 0]]]]
+
+    If is_causal=True:
+    mask = [[[[1, 0, 0, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0, 0],
+            [1, 1, 1, 0, 0, 0, 0],
+            [0, 0, 0, 1, 0, 0, 0],
+            [0, 0, 0, 1, 1, 0, 0],
+            [0, 0, 0, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 0, 0]]]]
-            [0, 0, 0, 0, 0, 0, 0]]]]
+            [0, 0, 0, 0, 0, 0, 1]]]]
-            [0, 0, 0, 0, 0, 0, 0]]]]
+            [0, 0, 0, 0, 0, 0, 1]]]]
+
     """
-    attention_mask = causal_inputs_mask * padding_mask * segment_mask
 
-    # Convert attention mask to binary:
-    attention_mask = (attention_mask < 0.5)
+    attention_mask = causal_inputs_mask * padding_mask * segment_mask
 
-    return attention_mask
+    # True for places we do not want to attend to
+    return ~attention_mask
 
 def param_size(parameter):
     """Return the element count of ``parameter``.
     Uses ``parameter.ds_numel`` when the object exposes a ``ds_id``
     attribute, and falls back to ``parameter.nelement()`` otherwise.
     """
     # NOTE(review): ``ds_id``/``ds_numel`` presumably come from DeepSpeed — confirm.
     if hasattr(parameter, 'ds_id'):
         return parameter.ds_numel
     return parameter.nelement()

diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
@@ -82,6 +82,8 @@ def encode(self, json_line):
         ids = {}
         for key in self.args.json_keys:
             text = data[key]
+            if self.args.prepend_space:
+                text = " " + text
             doc_ids = []
             for sentence in Encoder.splitter.tokenize(text):
                 sentence_ids = Encoder.tokenizer.tokenize(sentence)
@@ -117,6 +119,8 @@ def get_args():
                        help='Path to the BPE merge file (if necessary).')
     group.add_argument('--append-eod', action='store_true',
                        help='Append an <eod> token to the end of a document.')
+    group.add_argument('--prepend-space', action='store_true',
+                    help='Prepends a space to the beginning of a document')
     group.add_argument("--tokenizer-name-or-path", type=str, default=None,
                        help="Name or path of the huggingface tokenizer.")
     group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,