This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 9afc810

jacobkahn authored and joecummings committed

Batch decoder inference

1 parent 2ddc638 commit 9afc810

File tree

1 file changed: +87 -52 lines changed


torchtext/prototype/generate.py

Lines changed: 87 additions & 52 deletions
@@ -4,7 +4,13 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from flashlight.lib.text.decoder import LexiconFreeSeq2SeqDecoder, LexiconFreeSeq2SeqDecoderOptions, ZeroLM, create_emitting_model_state, get_obj_from_emitting_model_state
+from flashlight.lib.text.decoder import (
+    LexiconFreeSeq2SeqDecoder,
+    LexiconFreeSeq2SeqDecoderOptions,
+    ZeroLM,
+    create_emitting_model_state,
+    get_obj_from_emitting_model_state,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -108,7 +114,7 @@ def greedy_search(
         return input_ids
 
     def beam_search(
-        self,
+        self,
         input_ids: torch.Tensor,
         num_beams: int,
         max_len: int,
@@ -117,7 +123,7 @@ def beam_search(
         eos_score: float,
         eos_idx: int,
         num_python_workers: int,
-        **model_kwargs
+        **model_kwargs,
     ) -> torch.Tensor:
         """Beam search implemented using Flashlight Text (https://github.com/flashlight/text).
 
@@ -145,40 +151,65 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, timestep):
 
             # Copy over the `model_kwargs` in order to modify
             new_model_kwargs = model_kwargs.copy()
-
+
             # For first timestep, create previous step token_idxs and model_states
             if timestep == 0:
                 prev_step_token_idxs = [-1]
                 prev_step_model_states = [
                     create_emitting_model_state(
-                        Seq2SeqModelState(
-                            timestep=0,
-                            sequence=input_ids[i].unsqueeze(0),
-                            lm_scores=None
-                        )
+                        Seq2SeqModelState(timestep=0, sequence=input_ids[i].unsqueeze(0), lm_scores=None)
                     )
                 ]
-
-            if self.is_encoder_decoder:
-                # Get the correct encoded seq from the full `encoder_output`` and put it in the correct format
-                new_model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output[i, :, :].unsqueeze(0)
 
+            encoder_output_indexed = encoder_output[i, :, :].unsqueeze(0) if self.is_encoder_decoder else None
+            prev_model_state_sequences = [
+                get_obj_from_emitting_model_state(state).sequence for state in prev_step_model_states
+            ]
             out_probs, model_states = [], []
-            for idx, model_state_ptr in zip(prev_step_token_idxs, prev_step_model_states):
-                # Convert `idx` into a Tensor b/c it's always returned as a native python `int`
-                idx = torch.Tensor([idx]).to(torch.long)
-
-                # Get previous model state
-                prev_model_state = get_obj_from_emitting_model_state(model_state_ptr)
-
-                # Create new decoder token ids
-                if idx != -1:
-                    new_input_ids = torch.cat([prev_model_state.sequence, idx.unsqueeze(0)], dim=-1)
+
+            # Batch inference of chunks of elements in the beam
+            start = 0
+            # TODO: make this configurable to help people get around OOMs.
+            # This is the parallelism level at which elements in the beam will be batched
+            MAX_INFERENCE_BATCH_SIZE = 16
+            step = min(
+                MAX_INFERENCE_BATCH_SIZE, 1000 / (timestep + 1)
+            )  # many hypotheses will EOS, so increase the batch size gradually
+            cur_beam_size = len(prev_step_token_idxs)
+            while start < cur_beam_size:  # catch the remainder
+                end = start + step
+                if end > cur_beam_size:
+                    end = cur_beam_size
+
+                num_samples = end - start
+
+                if prev_step_token_idxs != [-1]:
+                    state_sequences = torch.cat(prev_model_state_sequences[start:end], dim=0)
+                    token_indices = torch.Tensor(prev_step_token_idxs[start:end]).to(torch.long).reshape(num_samples, 1)
+
+                    state_and_tokens = torch.cat(
+                        [state_sequences, token_indices], dim=-1
+                    )  # [batch_size x (timestep + 1)]
+                    assert state_and_tokens.shape == (
+                        num_samples,
+                        timestep + 1,
+                    ), f"state_and_tokens has shape {state_and_tokens.shape} = expected {(num_samples, timestep + 1)}"
                 else:
-                    new_input_ids = prev_model_state.sequence
-
+                    assert len(prev_model_state_sequences) == 1
+                    state_and_tokens = prev_model_state_sequences[0]  # dims: [1, 1]
+
+                start += step
+
+                # Cleanup -- combine this with the above
+                if self.is_encoder_decoder:
+                    # Expand encoder outputs along the batch dimension so that they match the decoder input state's batch size
+                    # This is a view-only operation and doesn't copy
+                    new_model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output_indexed.expand(
+                        num_samples if timestep > 0 else 1, -1, -1
+                    )
                 # Forward pass
-                model_inputs = self.model.prepare_inputs_for_generation(new_input_ids, **new_model_kwargs)
+                model_inputs = self.model.prepare_inputs_for_generation(state_and_tokens, **new_model_kwargs)
+
                 if self.is_huggingface_model:
                     model_inputs["return_dict"] = True
                     model_inputs["output_hidden_states"] = True
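Note: the hunk above is the core of this change. Instead of running one forward pass per beam hypothesis, hypotheses are scored in chunks of up to MAX_INFERENCE_BATCH_SIZE. The following is a minimal standalone sketch of the same chunking idea; `score_batch` and the tensor shapes are illustrative stand-ins rather than the torchtext API, and the timestep-0 special case from the hunk is omitted.

    import torch

    MAX_INFERENCE_BATCH_SIZE = 16

    def batched_beam_scores(prev_sequences, prev_token_idxs, timestep, score_batch):
        # prev_sequences: list of [1, timestep] LongTensors, one per live hypothesis
        # prev_token_idxs: list of ints, the last chosen token for each hypothesis
        # score_batch: callable mapping a [B, timestep + 1] LongTensor to [B, vocab] scores
        out_probs = []
        # Cap the chunk size; late timesteps shrink it further (never below 1)
        step = max(1, min(MAX_INFERENCE_BATCH_SIZE, 1000 // (timestep + 1)))
        start = 0
        cur_beam_size = len(prev_token_idxs)
        while start < cur_beam_size:  # catch the remainder
            end = min(start + step, cur_beam_size)
            num_samples = end - start
            # Concatenate the chunk of hypotheses into one decoder input of shape [num_samples, timestep + 1]
            state_sequences = torch.cat(prev_sequences[start:end], dim=0)
            token_indices = torch.tensor(prev_token_idxs[start:end], dtype=torch.long).reshape(num_samples, 1)
            state_and_tokens = torch.cat([state_sequences, token_indices], dim=-1)
            # A single forward pass scores the whole chunk
            lm_scores = score_batch(state_and_tokens)
            out_probs.extend(lm_scores[i].tolist() for i in range(num_samples))
            start += step
        return out_probs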
@@ -188,18 +219,29 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, timestep):
                 lm_scores = outputs[output_key]
 
                 # Keep track of probabilities over vocab for this pairing
-                out_probs.append(torch.squeeze(lm_scores[:, -1]).tolist())
-
-                # Keep track of sequence and decoder hidden states
-                model_states.append(
-                    create_emitting_model_state(
-                        Seq2SeqModelState(
-                            timestep=timestep,
-                            sequence=new_input_ids,
-                            lm_scores=lm_scores
+                # TODO: clean up duplicate code in these branches
+                if timestep == 0:
+                    sample_lm_scores = torch.squeeze(lm_scores[:, -1])
+                    out_probs.append(sample_lm_scores.tolist())
+                    model_states.append(
+                        create_emitting_model_state(
+                            Seq2SeqModelState(timestep=timestep, sequence=state_and_tokens, lm_scores=sample_lm_scores)
                         )
                     )
-                )
+                else:
+                    for i in range(num_samples):
+                        sample_lm_scores = lm_scores[i, -1]
+                        out_probs.append(sample_lm_scores.tolist())
+                        # Keep track of sequence and decoder hidden states
+                        model_states.append(
+                            create_emitting_model_state(
+                                Seq2SeqModelState(
+                                    timestep=timestep,
+                                    sequence=state_and_tokens[i].unsqueeze(0),
+                                    lm_scores=sample_lm_scores,
+                                )
+                            )
+                        )
 
             return out_probs, model_states
 
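After each batched forward pass, the scores have to be split back into one (out_probs, model_state) pair per hypothesis, which is what the else branch above does via create_emitting_model_state. A rough sketch of that unbatching step follows; HypothesisState here is a plain stand-in for Seq2SeqModelState, not the torchtext class.

    from dataclasses import dataclass
    from typing import List, Optional, Tuple

    import torch

    @dataclass
    class HypothesisState:  # illustrative stand-in for Seq2SeqModelState
        timestep: int
        sequence: torch.Tensor            # [1, timestep + 1]
        lm_scores: Optional[torch.Tensor]

    def split_batched_scores(
        lm_scores: torch.Tensor,          # [num_samples, seq_len, vocab] decoder output for one chunk
        state_and_tokens: torch.Tensor,   # [num_samples, timestep + 1] decoder input for the same chunk
        timestep: int,
    ) -> Tuple[List[List[float]], List[HypothesisState]]:
        out_probs, model_states = [], []
        for i in range(lm_scores.size(0)):
            sample_lm_scores = lm_scores[i, -1]  # vocab scores at the last position
            out_probs.append(sample_lm_scores.tolist())
            model_states.append(
                HypothesisState(
                    timestep=timestep,
                    sequence=state_and_tokens[i].unsqueeze(0),
                    lm_scores=sample_lm_scores,
                )
            )
        return out_probs, model_states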

@@ -213,17 +255,13 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_model_states, timestep):
         )
 
         decoder = LexiconFreeSeq2SeqDecoder(
-            options=options,
-            lm=ZeroLM(),
-            eos_idx=eos_idx,
-            update_func=update_func,
-            max_output_length=max_len
+            options=options, lm=ZeroLM(), eos_idx=eos_idx, update_func=update_func, max_output_length=max_len
         )
 
         # Create these as function b/c unnamed functions (lambdas) cause problems w/ MP
         def select_second_elem_in_tuple(tup: Tuple[List[int], float]) -> float:
             return tup[1]
-
+
         def is_not_neg_one(elem: int) -> bool:
             return elem != -1
 
@@ -235,12 +273,7 @@ def beam_decode_step(timestep: int) -> torch.Tensor:
 
             # Find the best beam
             token_scores = [(hyp.tokens, hyp.score) for hyp in hyps]
-            final_tokens = list(
-                filter(
-                    is_not_neg_one,
-                    max(token_scores, key=select_second_elem_in_tuple)[0]
-                )
-            )
+            final_tokens = list(filter(is_not_neg_one, max(token_scores, key=select_second_elem_in_tuple)[0]))
 
             # Hack, but have to prepend the input tokens if decoder-only model
             if not self.is_encoder_decoder:
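For reference, the collapsed one-liner above picks the highest-scoring hypothesis and drops the -1 placeholder tokens. An equivalent expanded sketch is shown below; token_scores is assumed to be the same list of (tokens, score) pairs built from the decoder hypotheses, and the real code uses named helpers instead of a lambda because lambdas do not pickle for multiprocessing.

    from typing import List, Tuple

    def best_hypothesis_tokens(token_scores: List[Tuple[List[int], float]]) -> List[int]:
        # Pick the (tokens, score) pair with the highest score
        best_tokens, _best_score = max(token_scores, key=lambda pair: pair[1])
        # Drop -1 entries, which are placeholders rather than real token ids
        return [tok for tok in best_tokens if tok != -1]

    # Example: best_hypothesis_tokens([([5, 7, -1], 0.2), ([5, 9, 11], 0.9)]) -> [5, 9, 11]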
@@ -249,15 +282,15 @@ def beam_decode_step(timestep: int) -> torch.Tensor:
             # Makeshift padding so that we can stack the tensors
             while len(final_tokens) < max_len:
                 final_tokens += [0]
-
+
             # Convert from list to tensors
             final_tokens_as_tensors = torch.Tensor(final_tokens).to(torch.long)
 
             return final_tokens_as_tensors
 
         if num_python_workers > 1:
             logger.warning("Multiprocessing has not yet been implemented.")
-
+
         all_final_tokens = [beam_decode_step(i) for i in range(len(input_ids))]
 
         return torch.stack(all_final_tokens, dim=0)
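The makeshift padding above exists so that per-input token lists of different lengths can be stacked into a single [batch_size, max_len] tensor. A small sketch of that step, under the same assumption that 0 is an acceptable pad value:

    import torch

    def pad_and_stack(all_final_tokens, max_len, pad_value=0):
        # all_final_tokens: list of python lists of token ids, one per input in the batch
        padded = []
        for tokens in all_final_tokens:
            padded.append(torch.tensor(tokens + [pad_value] * (max_len - len(tokens)), dtype=torch.long))
        return torch.stack(padded, dim=0)  # [batch_size, max_len]

    # Example: pad_and_stack([[2, 4, 6], [2, 8]], max_len=4).shape == torch.Size([2, 4])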
@@ -297,6 +330,7 @@ def generate(
 
         if self.is_encoder_decoder:
             encoder = self.model.get_encoder()
+            # print("inputs size is", inputs.shape)
             model_kwargs["encoder_outputs"] = encoder(inputs)
             inputs = self._prepare_decoder_ids_for_generation(len(inputs), device=inputs.device, **model_kwargs)
 
@@ -309,7 +343,8 @@ def generate(
             return self.greedy_search(inputs, max_length, eos_idx, pad_idx=pad_idx, **model_kwargs)
         elif num_beams > 1:
             if beam_size_token is None:
-                raise ValueError("`beam_size_token` must be specified for beam search. \
+                raise ValueError(
+                    "`beam_size_token` must be specified for beam search. \
                     If confused about what to put, you can default to the vocab size of the model you are using."
                 )
             return self.beam_search(
@@ -321,7 +356,7 @@ def generate(
                 eos_score=eos_score,
                 num_python_workers=num_python_workers,
                 eos_idx=eos_idx,
-                **model_kwargs
+                **model_kwargs,
             )
         else:
             raise ValueError("`num_beams` must be >= 1.")
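One detail worth noting from the batching hunk earlier in this diff: encoder outputs are broadcast across each chunk with Tensor.expand, which returns a view over the same storage rather than copying, so a larger inference batch does not multiply encoder memory. A quick illustration with made-up shapes:

    import torch

    encoder_out = torch.randn(1, 37, 512)                  # [1, src_len, hidden]
    expanded = encoder_out.expand(16, -1, -1)              # [16, src_len, hidden], no data copied
    assert expanded.data_ptr() == encoder_out.data_ptr()   # same underlying storage
    repeated = encoder_out.repeat(16, 1, 1)                # repeat() would materialize 16 copies
    assert repeated.data_ptr() != encoder_out.data_ptr()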

0 commit comments
