@@ -104,15 +104,12 @@ def _update_model_kwargs_for_generation(
         self,
         outputs: Dict[str, Any],
         model_kwargs: Dict[str, Any],
-    ) -> MODEL_KWARGS_TYPE:
+    ) -> None:
         """After a forward pass, update model_kwargs for faster decoding. Modified from https://github.com/huggingface/transformers/blob/67d074874d285e616393c65a0e670088e1b6b74a/src/transformers/generation/utils.py#L692.
 
         Args:
             outputs (Dict[str, Any]): LM output.
             model_kwargs (Dict[str, Any]): Model keyword args to be modified for future runs.
-
-        Returns:
-            Modified model_kwargs w/ updated past, token_type_ids, and attention_mask.
         """
         # Update past
         if "past_key_values" in outputs:
@@ -143,8 +140,6 @@ def _update_model_kwargs_for_generation(
                 dim=-1,
             )
 
-        return model_kwargs
-
     def greedy_search(
         self,
         input_ids: torch.Tensor,
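
With the return dropped, call sites change from model_kwargs = self._update_model_kwargs_for_generation(...) to a bare call (see the beam_search hunk further down). This relies on Python's reference semantics; a tiny illustration:

    def mutate(d: dict) -> None:
        d["past"] = "kv-cache"


    kwargs: dict = {}
    alias = kwargs  # both names reference the same dict object
    mutate(kwargs)
    assert alias["past"] == "kv-cache"  # the mutation is visible through every reference
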
@@ -227,6 +222,8 @@ def beam_search(
         Returns:
             Tensor of the generated sequences.
         """
+        device = input_ids.device
+
         if self.is_encoder_decoder:
             encoder_output_key = "last_hidden_state" if self.is_huggingface_model else "encoder_output"
             encoder_output = model_kwargs["encoder_outputs"][encoder_output_key]
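
Caching device = input_ids.device once at the top of beam_search gives the nested update_func a stable handle for tensor placement, used in a later hunk in place of self.model.device. A sketch of the placement idiom, with a hypothetical helper name:

    import torch


    def make_token_indices(token_idxs: list, device: torch.device) -> torch.Tensor:
        # torch.tensor(...) can place data directly on the target device,
        # avoiding the CPU-then-copy round trip of torch.Tensor(...).to(...)
        return torch.tensor(token_idxs, dtype=torch.long, device=device).reshape(-1, 1)


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(make_token_indices([3, 7, 7], device).shape)  # torch.Size([3, 1])
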
@@ -236,9 +233,6 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_hyp_idxs, prev_
 
             i = T  # Hacky access to the current seq in inputs
 
-            # Copy over the `model_kwargs` in order to modify
-            new_model_kwargs = model_kwargs.copy()
-
             # For first timestep, create previous step token_idxs and model_states
             if timestep == 0:
                 prev_step_token_idxs = [-1]
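
Dropping the per-step model_kwargs.copy() removes an allocation from every decoding step and gives up little safety, since dict.copy() is shallow and nested values such as encoder_outputs were shared all along. A short demonstration with illustrative values:

    kwargs = {"encoder_outputs": {"last_hidden_state": [1, 2, 3]}}
    shallow = kwargs.copy()
    shallow["encoder_outputs"]["last_hidden_state"].append(4)
    # The nested dict is shared, so the old copy never isolated mutations anyway:
    assert kwargs["encoder_outputs"]["last_hidden_state"] == [1, 2, 3, 4]
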
@@ -273,7 +267,7 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_hyp_idxs, prev_
                 state_sequences = torch.cat(prev_model_state_sequences[start:end], dim=0)
                 token_indices = (
                     torch.Tensor(prev_step_token_idxs[start:end])
-                    .to(dtype=torch.long, device=self.model.device)
+                    .to(dtype=torch.long, device=device)
                     .reshape(num_samples, 1)
                 )
 
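
The device fix above matters because plain torch.nn.Module objects have no .device attribute (Hugging Face's PreTrainedModel adds one as a property), so self.model.device could raise AttributeError for non-HF wrapped models. If a module-derived device were ever needed, a common fallback looks like this hypothetical helper:

    import torch
    from torch import nn


    def module_device(module: nn.Module) -> torch.device:
        # Infer placement from the first parameter; nn.Module itself has no .device
        try:
            return next(module.parameters()).device
        except StopIteration:  # parameter-less module
            return torch.device("cpu")


    print(module_device(nn.Linear(4, 4)))  # cpu, unless the module was moved
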
@@ -286,23 +280,24 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_hyp_idxs, prev_
                 ), f"state_and_tokens has shape {state_and_tokens.shape} = expected {(num_samples, timestep + 1)}"
             else:
                 assert len(prev_model_state_sequences) == 1
-                state_and_tokens = token_indices = prev_model_state_sequences[0]  # dims: [1, 1]
+                state_and_tokens = token_indices = prev_model_state_sequences[0].expand(num_beams, -1)  # TODO: Make this more robust
+
 
             # Cleanup -- combine this with the above
             if self.is_encoder_decoder:
                 # Expand encoder outputs along the batch dimension so that they match the decoder input state's batch size
                 # This is a view-only operation and doesn't copy
-                new_model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output_for_curr_seq.expand(
-                    num_samples if timestep > 0 else 1, -1, -1
+                model_kwargs["encoder_outputs"][encoder_output_key] = encoder_output_for_curr_seq.expand(
+                    num_samples if timestep > 0 else num_beams, -1, -1
                 )
 
             # Preprocess inputs for generation
-            model_inputs = self.model.prepare_inputs_for_generation(token_indices, **new_model_kwargs)
+            model_inputs = self.model.prepare_inputs_for_generation(token_indices, **model_kwargs)
             if self.is_huggingface_model:
                 model_inputs.update(self._huggingface_model_input_values)
-                if len(prev_step_hyp_idxs) > 1 and model_inputs["past_key_values"] is not None:
+                if len(prev_step_hyp_idxs) > 1 and model_kwargs["past"] is not None:
                     model_inputs["past_key_values"] = self.model._reorder_cache(
-                        model_inputs["past_key_values"],
+                        model_kwargs["past"],
                         torch.Tensor(prev_step_hyp_idxs).to(dtype=torch.int32),
                     )
 
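
Two details in the hunk above: the first-step state is widened with .expand(num_beams, -1) so its leading dimension matches the beam count, and the KV cache handed to _reorder_cache now comes from model_kwargs["past"] rather than model_inputs. As the diff's own comment notes, expand is a broadcast view, so the widening copies nothing; illustrative shapes:

    import torch

    num_beams = 4
    first_step_state = torch.tensor([[101]])         # dims: [1, 1], e.g. a lone BOS token
    beamed = first_step_state.expand(num_beams, -1)  # dims: [4, 1]
    assert beamed.shape == (4, 1)
    assert beamed.data_ptr() == first_step_state.data_ptr()  # same storage: a view, not a copy
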
@@ -315,32 +310,23 @@ def update_func(emissions, N, T, prev_step_token_idxs, prev_step_hyp_idxs, prev_
 
             # HF optimizations to reduce overhead in future `forward` calls
             if self.is_huggingface_model:
-                new_model_kwargs = self._update_model_kwargs_for_generation(outputs, new_model_kwargs)
+                self._update_model_kwargs_for_generation(outputs, model_kwargs)
 
             # Keep track of probabilities over vocab for this pairing
-            # TODO: clean up duplicate code in these branches
-            if timestep == 0:
-                sample_lm_scores = torch.squeeze(lm_scores[:, -1])
+            # TODO: fix how we track the number here?
+            for i in range(lm_scores.shape[0]):
+                sample_lm_scores = lm_scores[i, -1]
                 out_probs.append(sample_lm_scores.tolist())
+                # Keep track of sequence and decoder hidden states
                 model_states.append(
                     create_emitting_model_state(
-                        Seq2SeqModelState(timestep=timestep, sequence=state_and_tokens, lm_scores=sample_lm_scores)
-                    )
-                )
-            else:
-                for i in range(num_samples):
-                    sample_lm_scores = lm_scores[i, -1]
-                    out_probs.append(sample_lm_scores.tolist())
-                    # Keep track of sequence and decoder hidden states
-                    model_states.append(
-                        create_emitting_model_state(
-                            Seq2SeqModelState(
-                                timestep=timestep,
-                                sequence=state_and_tokens[i].unsqueeze(0),
-                                lm_scores=sample_lm_scores,
-                            )
+                        Seq2SeqModelState(
+                            timestep=timestep,
+                            sequence=state_and_tokens[i].unsqueeze(0),
+                            lm_scores=sample_lm_scores,
                         )
                     )
+                )
 
             start += step
 
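
The last hunk collapses the duplicated timestep == 0 / else branches into one loop over lm_scores.shape[0], which is num_beams at the first step (thanks to the expand above) and num_samples afterwards. A reduced sketch of the now single code path, with illustrative shapes:

    import torch


    def collect_final_scores(lm_scores: torch.Tensor) -> list:
        # One branch-free path: take the last position's scores for every row
        return [lm_scores[i, -1].tolist() for i in range(lm_scores.shape[0])]


    first_step = torch.randn(4, 1, 10)  # [num_beams, seq, vocab]
    later_step = torch.randn(6, 3, 10)  # [num_samples, seq, vocab]
    assert len(collect_final_scores(first_step)) == 4
    assert len(collect_final_scores(later_step)) == 6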