from dataclasses import dataclass
- from typing import Any, Dict, List, Optional, Tuple
+ from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F

    get_obj_from_emitting_model_state,
)

+ import logging
+ import warnings
logger = logging.getLogger(__name__)

DEFAULT_MAX_SEQ_LEN = 256
@@ -50,34 +52,52 @@ class GenerationUtils(nn.Module):
    More examples can be found in the `notebooks` directory of this repository.
    """

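+     # Default kwargs applied to every HuggingFace model forward call made during generation.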
+     _huggingface_model_input_values = {
+         "return_dict": True,
+         "use_cache": True,
+         "output_hidden_states": True
+     }
+ 
    def __init__(self, model: nn.Module, **kwargs) -> None:
        super().__init__()
        self.model = model
        self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", True)
        self.is_huggingface_model = kwargs.pop("is_huggingface_model", False)

-     def _prepare_encoder_decoder_kwargs_for_generation(self, inputs, model_kwargs):
-         """Modified from."""
+     def _prepare_encoder_decoder_kwargs_for_generation(self, inputs: torch.Tensor, model_kwargs: Dict[str, Any]) -> Dict[str, Any]:
+         """Runs the encoder and adds its outputs to `model_kwargs` for decoding. Modified from https://github.com/huggingface/transformers/blob/67d074874d285e616393c65a0e670088e1b6b74a/src/transformers/generation/utils.py#L592.
+ 
+         Args:
+             inputs (Tensor): Tokenized starting sequence(s).
+             model_kwargs (Dict[str, Any]): Model keyword arguments to be modified for decoding.
+ 
+         Returns:
+             Modified model_kwargs with the addition of the encoded input sequence(s).
+         """
        # Get encoder
        encoder = self.model.get_encoder()

-         # Prepare encoder args and encoder kwargs from model kwargs
-         irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
-         encoder_kwargs = {}
-         for argument, value in model_kwargs.items():
-             if not any([argument.startswith(p) for p in irrelevant_prefix]):
-                 encoder_kwargs[argument] = value
+         # Create a copy of the model kwargs to pass to the encoder
+         encoder_kwargs = model_kwargs.copy()

        # Forward pass
        if self.is_huggingface_model:
            encoder_kwargs["return_dict"] = True

        model_kwargs["encoder_outputs"] = encoder(inputs, **encoder_kwargs)

        return model_kwargs

    def _prepare_decoder_ids_for_generation(
        self, batch_size: int, pad_idx: int = 0, device: Optional[torch.device] = None, model_kwargs: Optional[Dict[str, Any]] = None
    ):
+         """Prepare decoder IDs for generation."""
        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
            return model_kwargs.pop("decoder_input_ids")
        else:
@@ -87,16 +107,23 @@ def _update_model_kwargs_for_generation(
        self,
        outputs: Dict[str, Any],
        model_kwargs: Dict[str, Any],
-         is_encoder_decoder: bool = False,
    ) -> Dict[str, Any]:
-         """Modified from."""
+         """After a forward pass, update model_kwargs for faster decoding. Modified from https://github.com/huggingface/transformers/blob/67d074874d285e616393c65a0e670088e1b6b74a/src/transformers/generation/utils.py#L692.
+ 
+         Args:
+             outputs (Dict[str, Any]): LM output.
+             model_kwargs (Dict[str, Any]): Model keyword args to be modified for future runs.
+ 
+         Returns:
+             Modified model_kwargs w/ updated past, token_type_ids, and attention_mask.
+         """
        # Update past
        if "past_key_values" in outputs:
-             model_kwargs["past"] = outputs.past_key_values
+             model_kwargs["past"] = outputs["past_key_values"]
        elif "mems" in outputs:
-             model_kwargs["past"] = outputs.mems
+             model_kwargs["past"] = outputs["mems"]
        elif "past_buckets_states" in outputs:
-             model_kwargs["past"] = outputs.past_buckets_states
+             model_kwargs["past"] = outputs["past_buckets_states"]
        else:
            model_kwargs["past"] = None

@@ -105,13 +132,19 @@ def _update_model_kwargs_for_generation(
            token_type_ids = model_kwargs["token_type_ids"]
            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)

-         # Update attention mask
-         if not is_encoder_decoder:
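+         # Extend the (decoder) attention mask by one position to cover the newly generated token.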
+         if not self.is_encoder_decoder:
            if "attention_mask" in model_kwargs:
                attention_mask = model_kwargs["attention_mask"]
                model_kwargs["attention_mask"] = torch.cat(
                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
                )
+         else:
+             if "decoder_attention_mask" in model_kwargs:
+                 decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                 model_kwargs["decoder_attention_mask"] = torch.cat(
+                     [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
+                     dim=-1,
+                 )

        return model_kwargs

@@ -135,9 +168,7 @@ def greedy_search(
        while True:
            model_inputs = self.model.prepare_inputs_for_generation(input_ids, **model_kwargs)
            if self.is_huggingface_model:
-                 model_inputs["return_dict"] = True
-                 model_inputs["use_cache"] = True
-                 model_inputs["output_hidden_states"] = True
+                 model_inputs.update(self._huggingface_model_input_values)

            # Get model output
            outputs = self.model(**model_inputs)
@@ -177,7 +208,7 @@ def beam_search(
        eos_idx: int,
        num_python_workers: int,
        max_inference_batch_size: int,
-         model_kwargs,
+         model_kwargs: Dict[str, Any],
    ) -> torch.Tensor:
        """Beam search implemented using Flashlight Text (https://github.com/flashlight/text).

@@ -260,26 +291,32 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                    num_samples if timestep > 0 else 1, -1, -1
                )

-                 # Forward pass
+                 # Preprocess inputs for generation
                model_inputs = self.model.prepare_inputs_for_generation(state_and_tokens, **new_model_kwargs)
-                 print(model_inputs.get("use_cache"), model_inputs.get("past_key_values"))
- 
                if self.is_huggingface_model:
-                     model_inputs["return_dict"] = True
-                     model_inputs["use_cache"] = True
-                     model_inputs["output_hidden_states"] = True
+                     model_inputs.update(self._huggingface_model_input_values)

-                 print(model_inputs.get("use_cache"), model_inputs.get("past_key_values"))
+                 from types import MappingProxyType

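+                 # MappingProxyType provides a read-only view, so the prepared inputs cannot be mutated accidentally before the forward pass.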
+                 model_inputs = MappingProxyType(model_inputs)
+                 # Forward pass
                outputs = self.model(**model_inputs)
+ 
+                 # Collect outputs
                output_key = "logits" if self.is_huggingface_model else "decoder_output"
                lm_scores = outputs[output_key]

                # HF optimizations to reduce overhead in future `forward` calls
                if self.is_huggingface_model:
-                     new_model_kwargs = self._update_model_kwargs_for_generation(outputs, new_model_kwargs, is_encoder_decoder=self.is_encoder_decoder)
+                     new_model_kwargs = self._update_model_kwargs_for_generation(outputs, new_model_kwargs)
                    if new_model_kwargs["past"] is not None:
-                         new_model_kwargs["past"] = self.model._reorder_cache(new_model_kwargs["past"], torch.Tensor(num_samples).to(dtype=torch.int32, device=self.model.device))
+                         beam_indices += [start for _ in range(num_samples)]
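+                         # Reorder the cached key/value states so they follow the beam hypotheses kept at this step.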
+                         new_model_kwargs["past"] = self.model._reorder_cache(
+                             new_model_kwargs["past"],
+                             torch.Tensor(beam_indices).to(dtype=torch.int32)  # TODO: verify that the beam indices are correct
+                         )

                # Keep track of probabilities over vocab for this pairing
                # TODO: clean up duplicate code in these branches
@@ -305,7 +342,7 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                        )
                    )
                )
- 
+ 
                start += step

            return out_probs, model_states
@@ -378,6 +415,8 @@ def forward(
        num_python_workers: int = 1,
        max_inference_batch_size: int = 16,
    ):
+         """Calls self.generate() method."""
+         warnings.warn("Forward method simply calls `GenerationUtils.generate()`. Please use generate method directly.")
        return self.generate(
            inputs=inputs,
            num_beams=num_beams,
@@ -391,41 +430,42 @@ def forward(
            max_inference_batch_size=max_inference_batch_size,
        )

- 
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        num_beams: Optional[int] = None,
        max_length: Optional[int] = None,
        pad_idx: int = 0,
        eos_idx: int = 1,
+         num_python_workers: int = 1,
        beam_threshold: int = 100,
        beam_size_token: Optional[int] = None,
        eos_score: float = -1.0,
-         num_python_workers: int = 1,
        max_inference_batch_size: int = 16,
    ) -> torch.Tensor:
-         """Generation method.
+         """Entrypoint generation method.

        Args:
            input_ids (Tensor): Ids of tokenized input tokens. The 'seed' text for generation.
            num_beams (int): If provided, specifies the number of beams to use in beam search generation.
            max_length (int): Max length to generate responses.
            pad_idx (int): Padding index. Defaults to 0.
            eos_idx (int): End-of-sequence index. Defaults to 1.
+             num_python_workers (int): If > 1, uses multiprocessing on CPU.
            beam_size_token (int): Vocab size for the beam search algo to evaluate, can typically default to vocab size of the model.
            beam_threshold (int): Threshold before pruning; specific to beam search.
            eos_score (float): Score to input when `eos_idx` is generated; specific to beam search.
+             max_inference_batch_size (int): In beam search, to avoid OOMs, hypotheses can be batched at most this many at a time; defaults to 16.

        Returns:
            Tensor of Tensors containing output sequences as ids.

-         Conditions for generation: \
-             1. `num_beams` == 1 or `num_beams` is None -> greedy search \
+         Conditions for generation:
+             1. `num_beams` == 1 or `num_beams` is None -> greedy search
            2. `num_beams` > 1 -> beam search
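+ 
+         Example:
+             >>> # Hypothetical usage sketch; `t5_model` and `input_ids` are placeholders, assuming a HuggingFace encoder-decoder model.
+             >>> generator = GenerationUtils(t5_model, is_huggingface_model=True)
+             >>> tokens = generator.generate(input_ids, num_beams=4, max_length=50, beam_size_token=t5_model.config.vocab_size)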
        """
        model_kwargs = {}
- 
+ 
        if self.is_encoder_decoder:
            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(inputs, model_kwargs)
            inputs = self._prepare_decoder_ids_for_generation(len(inputs), device=inputs.device, model_kwargs=model_kwargs)
@@ -436,6 +476,8 @@ def generate(
            max_length = DEFAULT_MAX_SEQ_LEN

        if num_beams == 1 or num_beams is None:
+             if num_python_workers > 1:
+                 logger.warning("Multiprocessing is not implemented for greedy search.")
            return self.greedy_search(inputs, max_length, eos_idx, pad_idx=pad_idx, model_kwargs=model_kwargs)
        elif num_beams > 1:
            if beam_size_token is None: