 from dataclasses import dataclass
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -26,7 +26,7 @@ class Seq2SeqModelState(object):
     lm_scores: Optional[torch.Tensor]
 
 
-class GenerationUtils:
+class GenerationUtils(nn.Module):
     """Wrapper to provide generation utils for encoder/decoder models and decoder models.
 
     Example:
@@ -51,20 +51,72 @@ class GenerationUtils:
     """
 
     def __init__(self, model: nn.Module, **kwargs) -> None:
+        super().__init__()
         self.model = model
         self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", True)
         self.is_huggingface_model = kwargs.pop("is_huggingface_model", False)
+
+    def _prepare_encoder_decoder_kwargs_for_generation(self, inputs, model_kwargs):
+        """Modified from."""
+        # Get encoder
+        encoder = self.model.get_encoder()
+
+        # Prepare encoder args and encoder kwargs from model kwargs
+        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
+        encoder_kwargs = {}
+        for argument, value in model_kwargs.items():
+            if not any([argument.startswith(p) for p in irrelevant_prefix]):
+                encoder_kwargs[argument] = value
+
+        # Forward pass
+        if self.is_huggingface_model:
+            encoder_kwargs["return_dict"] = True
+        model_kwargs["encoder_outputs"] = encoder(inputs, **encoder_kwargs)
+
+        return model_kwargs
 
     def _prepare_decoder_ids_for_generation(
-        self, batch_size: int, pad_idx: int = 0, device: Optional[torch.device] = None, **model_kwargs
+        self, batch_size: int, pad_idx: int = 0, device: Optional[torch.device] = None, model_kwargs: Optional[Dict[str, Any]] = None
     ):
         if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
             return model_kwargs.pop("decoder_input_ids")
         else:
             return torch.ones((batch_size, 1), dtype=torch.long, device=device) * pad_idx
 
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: Dict[str, Any],
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+    ) -> Dict[str, Any]:
+        """Modified from."""
+        # Update past
+        if "past_key_values" in outputs:
+            model_kwargs["past"] = outputs.past_key_values
+        elif "mems" in outputs:
+            model_kwargs["past"] = outputs.mems
+        elif "past_buckets_states" in outputs:
+            model_kwargs["past"] = outputs.past_buckets_states
+        else:
+            model_kwargs["past"] = None
+
+        # Update token_type_ids with last value
+        if "token_type_ids" in model_kwargs:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+        # Update attention mask
+        if not is_encoder_decoder:
+            if "attention_mask" in model_kwargs:
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+
+        return model_kwargs
+
     def greedy_search(
-        self, input_ids: torch.Tensor, max_length: int, eos_idx: int, pad_idx: Optional[int] = None, **model_kwargs
+        self, input_ids: torch.Tensor, max_length: int, eos_idx: int, pad_idx: Optional[int] = None, model_kwargs: Optional[Dict[str, Any]] = {}
     ) -> torch.Tensor:
         """Greedy search decoding for text generation. Takes the most likely next token every time.
 
@@ -73,7 +125,7 @@ def greedy_search(
             max_length (int): Max length to generate responses.
             eos_idx (int): End of sequence index.
             pad_idx (int): Padding index.
-            **model_kwargs
+            model_kwargs
 
         Returns:
             Batch of sequences decoded by greedy search.
@@ -123,7 +175,8 @@ def beam_search(
         eos_score: float,
         eos_idx: int,
         num_python_workers: int,
-        **model_kwargs,
+        max_inference_batch_size: int,
+        model_kwargs,
     ) -> torch.Tensor:
         """Beam search implemented using Flashlight Text (https://github.com/flashlight/text).
 
@@ -136,7 +189,7 @@ def beam_search(
             eos_score (float): Score to input when `eos_idx` is generated.
             eos_idx (int): End-of-sequence index.
             num_python_workers (int): Number of python workers to use for multiprocessing.
-            **model_kwargs
+            model_kwargs
 
         Returns:
             Tensor of the generated sequences.
@@ -147,8 +200,9 @@ def beam_search(
 
         def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, timestep):
             # `emissions` and `N` are unused in this current implementation
-            i = T  # Hacky, but access the current seq in inputs
-
+
+            i = T  # Hacky access to the current seq in inputs
+
             # Copy over the `model_kwargs` in order to modify
             new_model_kwargs = model_kwargs.copy()
 
@@ -161,31 +215,30 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                     )
                 ]
 
-            encoder_output_indexed = encoder_output[i, :, :].unsqueeze(0) if self.is_encoder_decoder else None
+            encoder_output_for_curr_seq = encoder_output[i, :, :].unsqueeze(0) if self.is_encoder_decoder else None
             prev_model_state_sequences = [
                 get_obj_from_emitting_model_state(state).sequence for state in prev_step_model_states
             ]
             out_probs, model_states = [], []
 
-            # Batch inference of chunks of elements in the beam
             start = 0
-            # TODO: make this configurable to help people get around OOMs.
             # This is the parallelism level at which elements in the beam will be batched
-            MAX_INFERENCE_BATCH_SIZE = 16
             step = min(
-                MAX_INFERENCE_BATCH_SIZE, 1000 / (timestep + 1)
+                max_inference_batch_size, 1000 / (timestep + 1)
             )  # many hypotheses will EOS, so increase the batch size gradually
-            cur_beam_size = len(prev_step_token_idxs)
-            while start < cur_beam_size:  # catch the remainder
+            curr_beam_size = len(prev_step_token_idxs)
+
+            # 2. Batched inference to get next tokens
+            while start < curr_beam_size:  # catch the remainder
                 end = start + step
-                if end > cur_beam_size:
-                    end = cur_beam_size
+                if end > curr_beam_size:
+                    end = curr_beam_size
 
-                num_samples = end - start
+                num_samples = end - start  # Is this always just going to be equal to curr_beam_size?
 
                 if prev_step_token_idxs != [-1]:
                     state_sequences = torch.cat(prev_model_state_sequences[start:end], dim=0)
-                    token_indices = torch.Tensor(prev_step_token_idxs[start:end]).to(torch.long).reshape(num_samples, 1)
+                    token_indices = torch.Tensor(prev_step_token_idxs[start:end]).to(dtype=torch.long, device=self.model.device).reshape(num_samples, 1)
 
                     state_and_tokens = torch.cat(
                         [state_sequences, token_indices], dim=-1
@@ -198,15 +251,14 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                     assert len(prev_model_state_sequences) == 1
                     state_and_tokens = prev_model_state_sequences[0]  # dims: [1, 1]
 
-                start += step
-
                 # Cleanup -- combine this with the above
                 if self.is_encoder_decoder:
                     # Expand encoder outputs along the batch dimension so that they match the decoder input state's batch size
                     # This is a view-only operation and doesn't copy
-                    new_model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output_indexed.expand(
+                    new_model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output_for_curr_seq.expand(
                         num_samples if timestep > 0 else 1, -1, -1
                     )
+
                 # Forward pass
                 model_inputs = self.model.prepare_inputs_for_generation(state_and_tokens, **new_model_kwargs)
 
@@ -218,6 +270,12 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                 output_key = "logits" if self.is_huggingface_model else "decoder_output"
                 lm_scores = outputs[output_key]
 
+                # HF optimizations to reduce overhead in future `forward` calls
+                if self.is_huggingface_model:
+                    new_model_kwargs = self._update_model_kwargs_for_generation(outputs, new_model_kwargs, is_encoder_decoder=self.is_encoder_decoder)
+                    if new_model_kwargs["past"] is not None:
+                        new_model_kwargs["past"] = self.model._reorder_cache(new_model_kwargs["past"], torch.Tensor(num_samples).to(dtype=torch.int32, device=self.model.device))
+
                 # Keep track of probabilities over vocab for this pairing
                 # TODO: clean up duplicate code in these branches
                 if timestep == 0:
@@ -242,9 +300,12 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
                             )
                         )
                     )
+
+                start += step
 
             return out_probs, model_states
 
+        # 3. Initialize options and decoder from Flashlight Text
         options = LexiconFreeSeq2SeqDecoderOptions(
             beam_size=num_beams,
             beam_size_token=beam_size_token,
@@ -258,14 +319,16 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, t
             options=options, lm=ZeroLM(), eos_idx=eos_idx, update_func=update_func, max_output_length=max_len
         )
 
-        # Create these as function b/c unnamed functions (lambdas) cause problems w/ MP
-        def select_second_elem_in_tuple(tup: Tuple[List[int], float]) -> float:
-            return tup[1]
+        # 4. Process outputs from beam decoder
+        # TODO: This can definitely be optimized
+        def beam_decode_step(timestep: int) -> torch.Tensor:
+            # Create these as function b/c unnamed functions (lambdas) cause problems w/ MP
+            def select_second_elem_in_tuple(tup: Tuple[List[int], float]) -> float:
+                return tup[1]
 
-        def is_not_neg_one(elem: int) -> bool:
-            return elem != -1
+            def is_not_neg_one(elem: int) -> bool:
+                return elem != -1
 
-        def beam_decode_step(timestep: int) -> torch.Tensor:
             # Decode step takes ptr to encoder emissions, i, and beam size token
             # but actually these aren't currently being used.
             decoder.decode_step(0, timestep, 0)
@@ -292,9 +355,38 @@ def beam_decode_step(timestep: int) -> torch.Tensor:
             logger.warning("Multiprocessing has not yet been implemented.")
 
         all_final_tokens = [beam_decode_step(i) for i in range(len(input_ids))]
-
+
+        # 5. Return top hypotheses for all input sequences
         return torch.stack(all_final_tokens, dim=0)
 
+
+    def forward(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        num_beams: Optional[int] = None,
+        max_len: Optional[int] = None,
+        pad_idx: int = 0,
+        eos_idx: int = 1,
+        beam_threshold: int = 100,
+        beam_size_token: Optional[int] = None,
+        eos_score: float = -1.0,
+        num_python_workers: int = 1,
+        max_inference_batch_size: int = 16,
+    ):
+        return self.generate(
+            inputs=inputs,
+            num_beams=num_beams,
+            max_len=max_len,
+            pad_idx=pad_idx,
+            eos_idx=eos_idx,
+            beam_threshold=beam_threshold,
+            beam_size_token=beam_size_token,
+            eos_score=eos_score,
+            num_python_workers=num_python_workers,
+            max_inference_batch_size=max_inference_batch_size,
+        )
+
+
     def generate(
         self,
         inputs: Optional[torch.Tensor] = None,
@@ -306,6 +398,7 @@ def generate(
         beam_size_token: Optional[int] = None,
         eos_score: float = -1.0,
         num_python_workers: int = 1,
+        max_inference_batch_size: int = 16,
     ) -> torch.Tensor:
         """Generation method.
 
@@ -329,10 +422,8 @@ def generate(
         model_kwargs = {}
 
         if self.is_encoder_decoder:
-            encoder = self.model.get_encoder()
-            # print("inputs size is", inputs.shape)
-            model_kwargs["encoder_outputs"] = encoder(inputs)
-            inputs = self._prepare_decoder_ids_for_generation(len(inputs), device=inputs.device, **model_kwargs)
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(inputs, model_kwargs)
+            inputs = self._prepare_decoder_ids_for_generation(len(inputs), device=inputs.device, model_kwargs=model_kwargs)
 
         if max_length is None:
             # Too hard to try to figure out the exact max_seq_length for each model
@@ -356,7 +447,8 @@ def generate(
                 eos_score=eos_score,
                 num_python_workers=num_python_workers,
                 eos_idx=eos_idx,
-                **model_kwargs,
+                max_inference_batch_size=max_inference_batch_size,
+                model_kwargs=model_kwargs,
             )
         else:
             raise ValueError("`num_beams` must be >= 1.")
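
A rough usage sketch (not taken from this PR) of how the wrapper above might be driven end to end. `seq2seq_model`, `source_ids`, and `vocab_size` are placeholders; the keyword arguments mirror the `generate`/`forward` signatures in this diff.

# Usage sketch with placeholder objects (not part of this PR).
# `seq2seq_model` is assumed to be an nn.Module exposing get_encoder() and
# prepare_inputs_for_generation(), e.g. a HuggingFace-style encoder/decoder model;
# `source_ids` is a LongTensor of shape (batch, src_len).
generator = GenerationUtils(seq2seq_model, is_encoder_decoder=True, is_huggingface_model=True)
output_ids = generator.generate(
    inputs=source_ids,
    num_beams=4,
    eos_idx=1,
    pad_idx=0,
    beam_size_token=vocab_size,       # assumed to match the model's vocabulary size
    max_inference_batch_size=16,      # new knob introduced by this diff
)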
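For reference, a self-contained sketch of the greedy-search idea that the `greedy_search` docstring describes: take the argmax token at every step and stop once every sequence in the batch has emitted `eos_idx`. `toy_decoder_step` is a stand-in for a real model forward pass and is not part of this PR.

import torch


def greedy_decode(decoder_step, input_ids, max_length, eos_idx):
    # Track which sequences in the batch have already emitted EOS
    finished = torch.zeros(input_ids.size(0), dtype=torch.bool)
    for _ in range(max_length):
        logits = decoder_step(input_ids)                 # (batch, seq_len, vocab)
        next_tokens = logits[:, -1, :].argmax(dim=-1)    # most likely next token per sequence
        # Once a sequence is finished, keep emitting EOS (acts like padding)
        next_tokens = torch.where(finished, torch.full_like(next_tokens, eos_idx), next_tokens)
        input_ids = torch.cat([input_ids, next_tokens.unsqueeze(-1)], dim=-1)
        finished |= next_tokens.eq(eos_idx)
        if finished.all():
            break
    return input_ids


# Toy stand-in for a model forward pass: random logits over a 10-token vocabulary.
def toy_decoder_step(ids):
    return torch.randn(ids.size(0), ids.size(1), 10)


print(greedy_decode(toy_decoder_step, torch.zeros(2, 1, dtype=torch.long), max_length=5, eos_idx=1))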