Add UL2 data sampling and pretraining #358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Status: Open
janEbert wants to merge 122 commits into bigscience-workshop:main from janEbert:ul2
Commits (122, all by janEbert)
b2fc665  Fix `PretrainedFromHF` tokenizer with T5 training
13becf1  Allow passing existing causal attention masks
7f50532  Refactor masked LM sampling style selection
d8db189  Add more masked LM sampling styles
006c4e9  Allow Prefix-LM style masked LM
f802317  Add UL2 pretraining for T5 model
deed87f  Refactor span merging
728e076  Support UL2 for decoder-only models
42ece6b  Unconditionally use safe maximum sequence length
d18f84e  Add custom exceptions
fa5aa68  Error out on too long sequences
c7d8a8b  Remove additional sequence truncation
c722516  Prefer array-from-list creation
69f6e70  Remove redundant imports
f08a104  Fix not inserting prefixes
d2fd03e  Do not insert `extra_id` tokens for PrefixLM task
daf52cc  Document `max_seq_length_dec` argument
04be590  Skip redundant computations
7bc5a87  Fix PrefixLM mean location
775e99d  Pad decoder-only inputs to same length
538c30b  Fix decoder-only attention mask shape
ba4476c  Document index set selection for PrefixLM masking
678fbdc  Fix `max_ngrams` for normal sampling style
00479e5  Do not limit `max_predictions_per_seq`
795caef  Calculate and use amount of filtered tokens
689e15f  Document normal sampling style
e44d0e4  Fix PrefixLM possible spans calculation
075f05f  Use binary search for PrefixLM first tail index
6bc7471  Calculate n-gram indices lazily
a105f32  Fix code style
f0fe282  Prefer list comprehensions
11bd6db  Allow recognizing when UL2 is used
43eee93  Support UL2 tokens for all tokenizers
6686f04  Support `<extra_id>` tokens for GPT tokenizer
f6128c6  Fix tokenizer vocab access
8f48763  Revert inheriting from `T5Dataset`
7f99a12  Fix GPT tokenizer special token handling
535a306  Do inherit from `torch.utils.data.Dataset`
db623b3  Add whitespace
ef72280  Allow selectively disabling denoiser token
001b50c  Allow not replacing masks with sentinel tokens
23c052f  Support not adding mask tokens in span corruption
0f4fd3f  Fix expected number of added tokens
da1f4e9  Fix non-masked data
55320ea  Fix unclear wording
5d27b27  Adjust code style
23181ab  Fix covered index skipping
6032cc6  Prepend objective token before truncating
c9c336f  Automatically truncate sequences for decoder-only
b8003cb  Fix covered span skipping fix
e3d91a6  Make `build_index_mappings` public
e61e78f  Refactor getting sample
c3b0a55  Add sample packing to T5 dataset
c4d748b  Add sample packing to UL2 dataset
689b57e  Fix typo and comment placement
af204e7  Fix not supplying `--pack-samples` argument
78eb035  Add support for UL2R-style implementation
c03eed4  Fix T5 dataset packing
9e84f06  Refactor `get_sample` to return a list
5e2b4f5  Fix T5 sample packing
e2a0c36  Fix UL2 sample packing
c2884c8  Refactor samples dict creation
7eb7923  Fix desired seq length
dd4c0d0  Fix padding removal
58148f8  Allow repeating UL2 prompt token when packing
c41fecd  Allow packing different denoisers together
057bb47  Refactor sample packing functions
e2062b7  Repeat prompt by default when packing UL2
d31b89f  Support pipelining for decoder-only model
17dca4f  Fix GPT tokenizer vocab size query
bf9b1eb  Handle possibly empty list
c4aa4cd  Fix no newline at EOF
8d7a0df  Allow full prefix Prefix-LM attention sampling
9bd6e1e  Support PrefixLM models
ba4ab49  Allow setting number of few-shot examples
9f53171  Update task/dataset name
5b63d0b  Do not remove last token
639b71d  Fix PrefixLM contexts
127d1e4  Fix module refactor
1bb788d  Fix possible `TypeError`
cf5965a  Optionally add prefix tokens
a538238  Automatically add UL2 tokens
3a8bc35  Fix context lengths batch chunking
6f0e33a  Allow different models to be loaded
9c4c718  Fix context batch size padding
754cf21  Add xPos embeddings
08b0eaf  Add optional UL2 normal distribution scaling
15622d2  Allow evaluating encoder-decoder models
e5a6169  Fix not passing `scale_normal_std`
d583fe9  Add T5-style GLU layers
ad7de7e  Rename xPos embedding class
81a68f7  Integrate xPos embedding
46e145d  Handle xPos embedding
482f0ea  Do not use bias for 2nd MLP layer if using T5 GLU
4385f7b  Fix T5 GLU constructor arguments
2d24b13  Refactor samples dict creation
bd461f5  Move callees under caller
35b2956  Handle empty context
f0171e0  Handle more possible model types
92158d8  Fix fully truncated contexts with prefix tokens
3b7692f  Make T5 GLU checks safer
b37d3ee  Improve import code style
5959e89  Refactor dummy barriers
ce8c1a5  Refactor file name creation
3e52966  Allow packing only full documents
23efa88  Use full-doc packing for T5-style datasets
88eb98a  Fix trying to all-reduce non-existent bias
59e8451  Fix truncating packed sequences without padding
24d46ff  Speed up packed dataset indexing
600542d  Try to exit padding removal early
58831d2  Fix xPos embedding
fe45cea  Fix padding loss mask
15e7b98  Handle failure mode regarding non-DS checkpoints
ae45a9e  Fix decoder-only and no-mask-tokens seq lengths
0c91b96  Omit second objective token if without mask tokens
0c246c4  Fix NumPy deprecations
7ce8635  Fix supplied arguments
7290181  Do not add separator if S-denoising
628d847  Fix caching error
9c727e7  Fix number of labels calculation for decoder-only
4ffa951  Do not automatically add <EOS> token when packing
ff5787e  Allow silently ignoring causal attention mask
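Taken together, the commit titles above describe the PR's core mechanism: each training sample is assigned one of UL2's denoisers (R-, S-, or X-denoising), the matching objective token is prepended before any truncation, and the sample is then masked according to that denoiser's corruption settings. The following is a minimal sketch of that selection step; the `DENOISER_CONFIGS` table, the `build_ul2_sample` name, the `[R]`/`[S]`/`[X]` tokens, and all hyperparameters are illustrative assumptions loosely based on the UL2 paper (Tay et al., 2022), not this PR's actual code.

```python
import random

# Illustrative denoiser table in the spirit of UL2's mixture-of-denoisers:
# R-denoising (short spans, light corruption), S-denoising (Prefix-LM-style
# suffix corruption), X-denoising (long spans or heavy corruption).
# Tokens and numbers are assumptions, not the PR's actual configuration.
DENOISER_CONFIGS = [
    {"token": "[R]", "mask_ratio": 0.15, "mean_span": 3.0},
    {"token": "[S]", "mask_ratio": 0.25, "mean_span": None},  # Prefix-LM style
    {"token": "[X]", "mask_ratio": 0.15, "mean_span": 32.0},
    {"token": "[X]", "mask_ratio": 0.50, "mean_span": 3.0},
]


def build_ul2_sample(tokens, rng):
    """Pick a denoiser uniformly at random, prepend its objective token,
    and compute the corruption budget for the masking step."""
    config = rng.choice(DENOISER_CONFIGS)
    # Prepend the objective token before any truncation, in the spirit of
    # commit 6032cc6 ("Prepend objective token before truncating").
    sample = [config["token"]] + list(tokens)
    num_to_mask = max(1, round(config["mask_ratio"] * len(tokens)))
    return sample, config, num_to_mask


rng = random.Random(0)
sample, config, num_to_mask = build_ul2_sample("the cat sat on the mat".split(), rng)
```

The PR itself goes much further (sample packing, sentinel-token handling, decoder-only and pipeline support), but this is the rough shape of the sampling entry point the commits revolve around.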
Conversations
Review comment: `normal_mean` is not used, it seems.
Reply: It's used here. :)
d8db189#diff-e1d14be32f4489a01cb8d571804fbba003f7f90715ef3cb3a27d9099e0245d6fR298
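For context: the "normal" sampling style added in d8db189 draws masked-span lengths from a normal distribution, so `normal_mean` sets that distribution's mean. A minimal sketch of how such a parameter is plausibly consumed; the function name, signature, and standard-deviation choice are assumptions, not the code at the linked line.

```python
import numpy as np


def sample_span_length(rng, normal_mean, scale_normal_std=False):
    """Draw one masked-span length from a normal distribution.

    `normal_mean` sets the distribution's mean; `scale_normal_std` ties the
    standard deviation to the mean (an assumption echoing this PR's later
    `scale_normal_std` commits, 08b0eaf and e5a6169).
    """
    std = normal_mean / 2.0 if scale_normal_std else 1.0
    length = int(round(rng.normal(loc=normal_mean, scale=std)))
    return max(1, length)  # a span covers at least one token


rng = np.random.default_rng(42)
lengths = [sample_span_length(rng, normal_mean=3.0) for _ in range(5)]
```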