24 changes: 21 additions & 3 deletions finetune_t0_non_causal_decoder.py
@@ -47,6 +47,21 @@ def model_provider(pre_process=True, post_process=True):
see_memory_usage(f"After Building Model", force=True)
return model

def visualize_model_inputs(tokens, attention_mask, labels, loss_mask):
import os
if os.path.exists("batchoutput.json"):
return
out = {
"tokens": tokens[0,:].tolist(),
"labels": labels[0,:].tolist(),
"attention_mask": attention_mask[0,:].tolist(),
"loss_mask": loss_mask[0,:].tolist(),
}
import json
with open('batchoutput.json', 'w') as fp:
json.dump(out, fp)


def get_batch_pipe(data):
"""
Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion
@@ -83,17 +98,20 @@ def get_batch_pipe(data):
)
# Only compute loss over causal target tokens, i.e. ignore input_tokens & padding
loss_on_targets_only = ~data_c["decoder_is_inputs"][:, 1:]
loss_on_non_pad_only = (tokens != tokenizer.pad)
loss_on_non_pad_only = (labels != tokenizer.pad)
loss_mask *= loss_on_targets_only * loss_on_non_pad_only

attention_mask = get_packed_attention_mask(
# Run non-causal decoder
is_causal=False,
causal_mask=~(causal_mask.bool()),
is_causal=True,
Member: let's rename this file finetune_t0_causal_decoder then

Collaborator (Author): What about just finetune_t0.py?

Member: Right, but do we hardcode this every time? I'd rather have this one be the script for the causal decoder.

Collaborator (Author): Added an argument prefixlm.

causal_mask=~(causal_mask.bool()), # Turn back into tril being ones
decoder_is_inputs=decoder_is_inputs.bool(),
segment_ids=segment_ids.long(),
)

# Helper script
# visualize_model_inputs(tokens, attention_mask, labels, loss_mask)

if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]:
raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.")

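For context on the `loss_on_non_pad_only` fix above (padding is now checked against `labels` rather than `tokens`), here is a small self-contained sketch of the loss-mask logic. The token ids, pad id, and packing layout below are made up purely for illustration and are not taken from the repo.

import torch

PAD = 0  # hypothetical pad id, for illustration only

# One packed sample, already shifted so that labels[i] is the token to predict at position i:
# doc 1 = [input, input, target, target], doc 2 = [input, target], then one pad position.
labels            = torch.tensor([[11, 12, 13, 14, 21, 22, PAD]])
decoder_is_inputs = torch.tensor([[1, 1, 0, 0, 1, 0, 0]], dtype=torch.bool)  # aligned with labels here
loss_mask         = torch.ones_like(labels, dtype=torch.float)

loss_on_targets_only = ~decoder_is_inputs        # only train on target tokens ...
loss_on_non_pad_only = labels != PAD             # ... and never on padding (checked on labels, not tokens)
loss_mask *= loss_on_targets_only * loss_on_non_pad_only
print(loss_mask)  # tensor([[0., 0., 1., 1., 0., 1., 0.]])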
2 changes: 1 addition & 1 deletion megatron/checkpointing.py
@@ -273,7 +273,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True

if args.deepspeed:
load_optimizer_states = False if args.no_load_optim else True
loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states)
loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states)
if loaded_dir is None:
print_rank_0('WARNING: could not find the metadata file {} '.format(
load_dir))
16 changes: 15 additions & 1 deletion megatron/model/gpt_model.py
@@ -158,6 +158,16 @@ def load_state_dict(self, state_dict, strict=True):
state_dict = state_dict[self._language_model_key]
self.language_model.load_state_dict(state_dict, strict=strict)

def visualize_outputs(losses):
import os
if os.path.exists("losses.json"):
return
out = {
"losses": losses[0,:].tolist(),
}
import json
with open('losses.json', 'w') as fp:
json.dump(out, fp)

def get_cross_entropy(is_prefix: bool):
def CrossEntropy(output, labels):
Expand All @@ -167,6 +177,9 @@ def CrossEntropy(output, labels):

losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), labels)

# Helper script
# visualize_outputs(losses)

if is_prefix:
micro_batch_size, sequence_length = loss_mask.shape
average_tokens_per_sample: torch.Tensor
@@ -252,7 +265,8 @@ def _to_float16(inputs):
args.num_layers),
layer_number=layer_idx,
# TODO: Change naming of class from GPT to something that encapsulate prefix lm.
self_attn_mask_type=attn_mask_type))
self_attn_mask_type=attn_mask_type)
)

# Undo data format change
def undo(x):
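As background for the `CrossEntropy` changes above: the per-token losses returned by `mpu.vocab_parallel_cross_entropy` are reduced using the loss mask. Below is a minimal sketch of the usual (non-prefix) reduction; it illustrates the idea only and is not the repo's exact code. The `is_prefix` branch (truncated above) additionally computes an `average_tokens_per_sample` normalisation.

import torch

def masked_lm_loss(losses: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor:
    # losses, loss_mask: [micro_batch_size, sequence_length]
    loss_mask = loss_mask.view(-1).float()
    # Average only over positions that actually contribute to the loss (targets, non-padding).
    return torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()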
30 changes: 23 additions & 7 deletions megatron/utils.py
@@ -261,11 +261,18 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode
- segment_ids: torch.IntTensor [batch_size, sequence_length]
Returns:
- attention_mask: torch.BoolTensor [batch_size, 1, sequence_length, sequence_length]

Input example for the mask examples:
att_mask_batch = 1
seq_length = 7
decoder_is_inputs = torch.tensor([[1, 1, 0, 1, 1, 0, 0]])
segment_ids = torch.tensor([[1, 1, 1, 2, 2, 2, 0]])
causal_mask = torch.tril(torch.ones(att_mask_batch, seq_length, seq_length)).view(att_mask_batch, 1, seq_length, seq_length)
"""

"""Causal Inputs Mask:
mask = [[[[1, 1, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0],
mask = [[[[1, 1, 0, 1, 1, 0, 0],
[1, 1, 0, 1, 1, 0, 0],
[1, 1, 1, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 0, 0],
@@ -299,7 +306,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode
[0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 0]]]]
[0, 0, 0, 0, 0, 0, 1]]]]
"""
segment_mask = segment_ids[:, None, :, None] == segment_ids[:, None, None, :]

@@ -311,13 +318,22 @@
[0, 0, 0, 1, 1, 0, 0],
[0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 0]]]]

If is_causal=True:
mask = [[[[1, 0, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 1, 1, 0, 0],
[0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 0]]]]
Member (suggested change): make the last row of the example [0, 0, 0, 0, 0, 0, 1]]]] instead of [0, 0, 0, 0, 0, 0, 0]]]].

Collaborator (Author): I don't think there is a 1, because the last row & column is 100% padding.

Member: Hmm, I'm wondering if this doesn't screw something up. Essentially you're going to compute softmax on a row with only zeros ...

Collaborator (Author): The last row & last column are the attention scores of the last token with respect to the last token. Since the last token is masked out in our loss_mask, it doesn't matter, I think. Also, it's a row with only -inf, no?

Member: No, you compute softmax; what should be the result of the softmax of a row full of masked-out values? It feels like that would return lots of NaNs.

Collaborator (Author): Don't we fill it with -inf? And the softmax of a row where all values are the same is just 1/n, no? Where would it cause NaNs?

Member (@thomasw21, Jul 12, 2022): You can try writing a test, but I would be pretty sure that the actual results are 0 (with the current kernel).


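To make the disagreement above concrete, here is what plain torch.softmax does on a fully masked row; whether the fused masked-softmax kernel used in training behaves the same way is exactly the open question in this thread.

import torch

seq_length = 7
scores = torch.zeros(seq_length)                          # raw attention scores for the all-padding position
fully_masked = torch.ones(seq_length, dtype=torch.bool)   # True = do not attend (the whole row)

# Masking with -inf: exp(-inf) = 0 everywhere, the normaliser is 0, and 0/0 gives NaN.
print(torch.softmax(scores.masked_fill(fully_masked, float("-inf")), dim=-1))
# tensor([nan, nan, nan, nan, nan, nan, nan])

# Masking with a large finite negative value keeps the row finite: a uniform 1/n distribution.
print(torch.softmax(scores.masked_fill(fully_masked, -10000.0), dim=-1))
# tensor([0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429])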
"""
attention_mask = causal_inputs_mask * padding_mask * segment_mask

# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
attention_mask = causal_inputs_mask * padding_mask * segment_mask

return attention_mask
# True for places we do not want to attend to
return ~attention_mask

def param_size(parameter):
return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement()
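For reference, a minimal standalone sketch of how the three component masks combine for the docstring's example inputs. The construction of causal_inputs_mask and padding_mask below is inferred from the docstring examples and may differ in detail from the actual helper; segment_mask and the final combination match the code shown above.

import torch

# Example inputs from the docstring: one sample, two packed documents plus a trailing pad token.
seq_length = 7
decoder_is_inputs = torch.tensor([[1, 1, 0, 1, 1, 0, 0]], dtype=torch.bool)
segment_ids = torch.tensor([[1, 1, 1, 2, 2, 2, 0]], dtype=torch.long)
causal_mask = torch.tril(torch.ones(1, seq_length, seq_length)).view(1, 1, seq_length, seq_length).bool()

# Prefix-LM ("non-causal") case: input positions may additionally attend to each other bidirectionally.
inputs_mask = decoder_is_inputs[:, None, :, None] & decoder_is_inputs[:, None, None, :]
causal_inputs_mask = causal_mask | inputs_mask   # with is_causal=True this would just be causal_mask

# Padding positions (segment id 0) neither attend nor are attended to.
not_padding = segment_ids != 0
padding_mask = not_padding[:, None, :, None] & not_padding[:, None, None, :]

# Tokens only attend within their own packed document.
segment_mask = segment_ids[:, None, :, None] == segment_ids[:, None, None, :]

attention_mask = causal_inputs_mask & padding_mask & segment_mask
print(attention_mask[0, 0].int())   # reproduces the combined mask shown in the docstring
# The helper then returns ~attention_mask, i.e. True marks positions we do NOT attend to.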
4 changes: 4 additions & 0 deletions tools/preprocess_data.py
@@ -82,6 +82,8 @@ def encode(self, json_line):
ids = {}
for key in self.args.json_keys:
text = data[key]
if self.args.prepend_space:
text = f" {text}"
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
sentence_ids = Encoder.tokenizer.tokenize(sentence)
@@ -117,6 +119,8 @@ def get_args():
help='Path to the BPE merge file (if necessary).')
group.add_argument('--append-eod', action='store_true',
help='Append an <eod> token to the end of a document.')
group.add_argument('--prepend-space', action='store_true',
help='Prepends a space to the beginning of a document')
Member: Add a mention of the context in which it's useful; typically, it is when you compute targets.

group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
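Regarding the reviewer's request above: --prepend-space matters because BPE tokenizers encode a word differently with and without a leading space, which is relevant when the document is a target continuation that will be concatenated after an input. A quick illustration with the HuggingFace GPT-2 tokenizer, used here only as an example (not necessarily the tokenizer configured for this pipeline):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # example tokenizer, for illustration only
print(tok.tokenize("world"))    # ['world']   -- no leading-space marker
print(tok.tokenize(" world"))   # ['Ġworld']  -- the token you would get mid-sentence, e.g. after "hello"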