pytorch
diff --git a/test/integration_tests/prototype/test_models.py b/test/integration_tests/prototype/test_models.py
Lines changed: 16 additions & 5 deletions
diff --git a/torchtext/prototype/generate.py b/torchtext/prototype/generate.py
Lines changed: 139 additions & 0 deletions
diff --git a/torchtext/prototype/models/t5/bundler.py b/torchtext/prototype/models/t5/bundler.py
Lines changed: 18 additions & 17 deletions
@@ -66,13 +66,22 @@ def _t5_model(self, is_jit, t5_model, expected_asset_name, test_text):
 
         model_input = transform(test_text)
         if model.encoder_only:
-            actual = model(model_input)["encoder_output"]
+            actual = model(encoder_tokens=model_input)["encoder_output"]
+            if not is_jit:
+                self._t5_get_encoder(model, model_input, actual)
         else:
-            actual = model(model_input)["decoder_output"]
+            actual = model(encoder_tokens=model_input)["decoder_output"]
 
         expected = torch.load(expected_asset_path)
         torch.testing.assert_close(actual, expected, atol=1e-04, rtol=2.5e-06)
 
+    def _t5_get_encoder(self, model, model_input, encoder_output):
+        encoder = model.get_encoder()
+        # Need to set the tgt_key_padding_mask to ensure the same results
+        encoder_padding_mask = model_input.eq(model.padding_idx)
+        output_from_get_encoder = encoder(tgt=model_input, tgt_key_padding_mask=encoder_padding_mask)["encoder_output"]
+        assert torch.all(output_from_get_encoder.eq(encoder_output))
+
     @nested_params(["jit", "not_jit"])
     def test_t5_model(self, name) -> None:
         configuration, type = self.model_name.split("_")
@@ -93,7 +102,8 @@ def test_t5_model(self, name) -> None:
     ],
 )
 class TestT5Wrapper(TorchtextTestCase):
-    @parameterized.expand(["jit", "not_jit"])
+    # No longer Torchscriptable
+    @parameterized.expand(["no_jit"])
     def test_t5_wrapper(self, name) -> None:
         configuration = self.configuration
         test_text = ["translate English to French: I want to eat pizza for dinner."]
@@ -113,7 +123,8 @@ def test_t5_wrapper(self, name) -> None:
 
 
 class TestT5WrapperCheckpoint(TorchtextTestCase):
-    @parameterized.expand(["jit", "not_jit"])
+    # No longer Torchscriptable
+    @parameterized.expand(["no_jit"])
     def test_t5_wrapper_checkpoint(self, name) -> None:
         test_text = ["translate English to French: I want to eat pizza for dinner."]
         expected_text = ["Je veux manger de la pizza pour le dîner."]
@@ -127,7 +138,7 @@ def test_t5_wrapper_checkpoint(self, name) -> None:
             padding_idx=0,
         )
         model = T5Wrapper(
-            checkpoint="https://download.pytorch.org/models/text/t5.base.generation.pt",
+            checkpoint="https://download.pytorch.org/models/text/t5.base.generation.v2.pt",
             t5_config=config,
             transform=transform,
             freeze_model=True,
 
@@ -0,0 +1,139 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class GenerationUtil:
+    """Wrapper to provide generation utils for encoder/decoder models and decoder models.
+
+    Example:
+    >>> model = T5_BASE_GENERATION.get_model()
+    >>> generative_model = GenerationUtil(model=model)
+    >>> generative_model.generate(input_ids, num_beams=1, max_len=100)
+
+    The wrapper can work with *any* model as long as it meets the following requirements:
+    1. Is an encoder/decoder or decoder based model.
+    2. Includes a `get_encoder` method (if applicable) and a `prepare_inputs_for_generation` method.
+
+    This means that popular HuggingFace implementations of T5, Bart, and GPT-2 can all be used with these generation utils!
+    >>> from transformers import T5ForConditionalGeneration
+    >>> model = T5ForConditionalGeneration.from_pretrained("t5-base")
+    >>> generative_model = GenerationUtil(model=model, is_huggingface_model=True)
+    >>> generative_model.generate(input_ids, num_beams=1, max_len=100)
+
+    More examples can be found in the `notebooks` directory of this repository.
+    """
+    def __init__(self, model: nn.Module, is_encoder_decoder: bool = True, is_huggingface_model: bool = False) -> None:
+        self.model = model
+        self.is_encoder_decoder = is_encoder_decoder
+        self.is_huggingface_model = is_huggingface_model
+
+    def _prepare_decoder_ids_for_generation(
+        self, batch_size: int, pad_idx: int = 0, device: Optional[torch.device] = None, **model_kwargs
+    ):
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            return model_kwargs.pop("decoder_input_ids")
+        else:
+            return torch.ones((batch_size, 1), dtype=torch.long, device=device) * pad_idx
+
+    def greedy_search(
+        self, input_ids: torch.Tensor, max_len: int, eos_idx: int, pad_idx: Optional[int] = None, **model_kwargs
+    ) -> torch.Tensor:
+        """Greedy search decoding for text generation. Takes the most likely next token every time.
+
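+        Args:
+            input_ids (Tensor): Text prompt(s) for greedy generation.
+            max_len (int): Maximum sequence length, counting the tokens already in `input_ids`, at which generation stops.
+            eos_idx (int): End of sequence index.
+            pad_idx (Optional[int]): Padding index emitted for sequences that have already finished.
+            **model_kwargs: Extra keyword arguments forwarded to the model's `prepare_inputs_for_generation`.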
+        
+        Returns:
+            Batch of sequences decoded by greedy search.
+        """
+        unfinished_sequences = torch.ones((input_ids.shape[0], 1), device=input_ids.device, dtype=torch.long)
+
+        while True:
+            model_inputs = self.model.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            if self.is_huggingface_model:
+                model_inputs["return_dict"] = True
+                model_inputs["output_hidden_states"] = True
+
+            # Get model output
+            outputs = self.model(**model_inputs)
+            output_key = "logits" if self.is_huggingface_model else "decoder_output"
+            decoder_output = outputs[output_key]
+
+            # Calculate probabilities and take the most likely next token
+            probs = F.log_softmax(decoder_output[:, -1], dim=-1)
+            _, next_tokens = torch.topk(probs, 1)
+
+            # For any finished sequences, padding idx should be the last token
+            if eos_idx is not None:
+                if pad_idx is not None:
+                    next_tokens = next_tokens * unfinished_sequences + pad_idx * (1 - unfinished_sequences)
+
+            # Append the next tokens to the previous tokens
+            input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+
+            if eos_idx is not None:
+                unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_idx).long())
+
+            # Stop iterating once all sequences are finished or exceed the max_len
+            if unfinished_sequences.max() == 0 or len(input_ids[0]) >= max_len:
+                break
+
+        return input_ids
+
+    def beam_search(self, input_ids: torch.Tensor, num_beams: int, max_len: Optional[int]) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        num_beams: Optional[int] = None,
+        max_len: Optional[int] = None,
+        pad_idx: int = 0,
+        eos_idx: int = 1,
+    ) -> torch.Tensor:
+        """Generation method.
+
+        `num_beams` == 1 or `num_beams` is None -> greedy search
+        `num_beams` > 1 -> beam search
+
+        Args:
+            inputs (Tensor): Ids of tokenized input tokens. The 'seed' text for generation.
+            num_beams (int): If provided, specifies the number of beams to use in beam search generation.
+            max_len (int): Max length to generate responses. Defaults to 256 (with a logged warning) when not provided.
+            pad_idx (int): Padding index. Defaults to 0.
+            eos_idx (int): End of sequence index. Defaults to 1.
+
+        Returns:
+            Tensor containing the batch of output sequences as token ids.
+
+        `Note`: If one beam is provided or no beams are specified, the generation method will default to greedy search.
+        """
+        model_kwargs = {}
+
+        if self.is_encoder_decoder:
+            encoder = self.model.get_encoder()
+            model_kwargs["encoder_outputs"] = encoder(inputs)
+            inputs = self._prepare_decoder_ids_for_generation(len(inputs), device=inputs.device, **model_kwargs)
+        
+        if max_len is None:
+            # Too hard to try to figure out the exact max_seq_length for each model
+            logger.warning("`max_len` was not specified. Defaulting to 256 tokens.")
+            max_len = 256
+
+        if num_beams == 1 or num_beams is None:
+            return self.greedy_search(inputs, max_len, eos_idx, pad_idx=pad_idx, **model_kwargs)
+        elif num_beams > 1:
+            return self.beam_search(inputs, num_beams, max_len)
+        else:
+            raise ValueError("`num_beams` must be >= 1.")
@@ -176,7 +176,8 @@ def build_model_from_huggingface_ckpt(
 
         t5_model_state_dict = {
             "token_embeddings.weight": hf_weights["shared.weight"],
-            "norm1.weight": hf_weights["encoder.final_layer_norm.weight"],
+            "encoder.token_embeddings.weight": hf_weights["shared.weight"],
+            "encoder.norm.weight": hf_weights["encoder.final_layer_norm.weight"],
             "encoder.layers.0.self_attn.relative_attention_bias.weight": hf_weights[
                 "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
             ],
@@ -210,7 +211,7 @@ def build_model_from_huggingface_ckpt(
 
         # Convert decoder layers if model is encoder-decoder
         if not config.encoder_only:
-            t5_model_state_dict["norm2.weight"] = hf_weights["decoder.final_layer_norm.weight"]
+            t5_model_state_dict["decoder.norm.weight"] = hf_weights["decoder.final_layer_norm.weight"]
             t5_model_state_dict["decoder.layers.0.self_attn.relative_attention_bias.weight"] = hf_weights[
                 "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
             ]
@@ -331,7 +332,7 @@ def config(self) -> T5Conf:
     """
 
 T5_BASE_ENCODER = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.base.encoder.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.base.encoder.v2.pt"),
     _config=T5Conf(encoder_only=True),
     transform=lambda: T5Transform(
         urljoin(_TEXT_BUCKET, "t5_tokenizer_base.model"),
@@ -344,7 +345,7 @@ def config(self) -> T5Conf:
 T5_BASE_ENCODER.__doc__ = ENCODER_DOC.format("BASE", "base")
 
 T5_BASE = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.base.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.base.v2.pt"),
     _config=T5Conf(encoder_only=False),
     transform=lambda: T5Transform(
         urljoin(_TEXT_BUCKET, "t5_tokenizer_base.model"),
@@ -357,7 +358,7 @@ def config(self) -> T5Conf:
 T5_BASE.__doc__ = MODEL_DOC.format("BASE", "base")
 
 T5_BASE_GENERATION = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.base.generation.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.base.generation.v2.pt"),
     _config=T5Conf(encoder_only=False, linear_head=True),
     transform=lambda: T5Transform(
         urljoin(_TEXT_BUCKET, "t5_tokenizer_base.model"),
@@ -370,7 +371,7 @@ def config(self) -> T5Conf:
 T5_BASE_GENERATION.__doc__ = GENERATION_DOC.format("BASE", "base")
 
 T5_SMALL_ENCODER = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.small.encoder.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.small.encoder.v2.pt"),
     _config=T5Conf(
         encoder_only=True,
         embedding_dim=512,
@@ -391,7 +392,7 @@ def config(self) -> T5Conf:
 
 
 T5_SMALL = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.small.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.small.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         embedding_dim=512,
@@ -411,7 +412,7 @@ def config(self) -> T5Conf:
 T5_SMALL.__doc__ = MODEL_DOC.format("SMALL", "small")
 
 T5_SMALL_GENERATION = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.small.generation.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.small.generation.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         linear_head=True,
@@ -432,7 +433,7 @@ def config(self) -> T5Conf:
 T5_SMALL_GENERATION.__doc__ = GENERATION_DOC.format("SMALL", "small")
 
 T5_LARGE_ENCODER = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.large.encoder.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.large.encoder.v2.pt"),
     _config=T5Conf(
         encoder_only=True,
         embedding_dim=1024,
@@ -452,7 +453,7 @@ def config(self) -> T5Conf:
 T5_LARGE_ENCODER.__doc__ = ENCODER_DOC.format("LARGE", "large")
 
 T5_LARGE = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.large.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.large.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         embedding_dim=1024,
@@ -472,7 +473,7 @@ def config(self) -> T5Conf:
 T5_LARGE.__doc__ = MODEL_DOC.format("LARGE", "large")
 
 T5_LARGE_GENERATION = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.large.generation.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.large.generation.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         linear_head=True,
@@ -493,7 +494,7 @@ def config(self) -> T5Conf:
 T5_LARGE_GENERATION.__doc__ = GENERATION_DOC.format("LARGE", "large")
 
 T5_3B_ENCODER = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.3b.encoder.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.3b.encoder.v2.pt"),
     _config=T5Conf(
         encoder_only=True,
         embedding_dim=1024,
@@ -514,7 +515,7 @@ def config(self) -> T5Conf:
 T5_3B_ENCODER.__doc__ = ENCODER_DOC.format("3B", "3B")
 
 T5_3B = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.3b.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.3b.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         embedding_dim=1024,
@@ -535,7 +536,7 @@ def config(self) -> T5Conf:
 T5_3B.__doc__ = MODEL_DOC.format("3B", "3B")
 
 T5_3B_GENERATION = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.3b.generation.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.3b.generation.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         linear_head=True,
@@ -557,7 +558,7 @@ def config(self) -> T5Conf:
 T5_3B_GENERATION.__doc__ = GENERATION_DOC.format("3B", "3B")
 
 T5_11B_ENCODER = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.11b.encoder.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.11b.encoder.v2.pt"),
     _config=T5Conf(
         encoder_only=True,
         embedding_dim=1024,
@@ -578,7 +579,7 @@ def config(self) -> T5Conf:
 T5_11B_ENCODER.__doc__ = ENCODER_DOC.format("11B", "11B")
 
 T5_11B = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.11b.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.11b.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         embedding_dim=1024,
@@ -599,7 +600,7 @@ def config(self) -> T5Conf:
 T5_11B.__doc__ = MODEL_DOC.format("11B", "11B")
 
 T5_11B_GENERATION = T5Bundle(
-    _path=urljoin(_TEXT_BUCKET, "t5.11b.generation.pt"),
+    _path=urljoin(_TEXT_BUCKET, "t5.11b.generation.v2.pt"),
     _config=T5Conf(
         encoder_only=False,
         linear_head=True,