Commit 2de6160

Add cross-attention to output hypotheses
1 parent 661af02 commit 2de6160

File tree

5 files changed: +135 -34 lines changed

nemo/collections/asr/modules/transformer/transformer_generators.py

Lines changed: 77 additions & 18 deletions
@@ -226,6 +226,7 @@ def _forward(
         step_confidence = None

         decoder_mems_list = None
+        xatt_scores_list = None
         for i in range(max_generation_length):

             if i == 0:
@@ -234,14 +235,20 @@ def _forward(
                 i += tgt_len - 1
             input_ids = tgt[:, -1:]

-            logits, decoder_mems_list, _ = self._one_step_forward(
+            logits, decoder_mems_list, new_xatt_scores_list = self._one_step_forward(
                 input_ids,
                 encoder_hidden_states,
                 encoder_input_mask,
                 decoder_mems_list,
                 i,
                 return_scores=return_beam_scores,
             )
+            if xatt_scores_list is not None:
+                for layer in range(len(xatt_scores_list)):
+                    xatt_scores_list[layer] = torch.cat(
+                        (xatt_scores_list[layer], new_xatt_scores_list[layer]), dim=2)
+            else:
+                xatt_scores_list = new_xatt_scores_list

             if self.temperature is None:  # Greedy decoding
                 next_tokens = torch.argmax(logits[:, -1], dim=-1)
@@ -272,7 +279,7 @@ def _forward(
             samples = list(tgt.view(orig_batch_size, self.n_samples, -1))
             tgt = tgt[:: self.n_samples]

-        return tgt, samples, step_confidence_tensor
+        return tgt, samples, step_confidence_tensor, xatt_scores_list

     def __call__(
         self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False
@@ -284,12 +291,12 @@ def __call__(
         if not return_beam_scores:
             return results
         else:
-            prefixes, scores, tgt = results
+            prefixes, scores, tgt, xatt_scores_list = results
             prefixes = prefixes.view(-1, self.beam_size, tgt.size(1)).split(1, dim=0)
             scores = scores.view(-1, self.beam_size).split(1, dim=0)
             prefixes = [x.squeeze(0) for x in prefixes]  # each item is [beam, seq_len]
             scores = [x.squeeze(0) for x in scores]  # each item is [beam,]
-            return prefixes, scores, tgt
+            return prefixes, scores, tgt, xatt_scores_list

     def freeze(self) -> None:
         """Freeze weights of embedding, decoder, and classification layers to prevent memory leak."""
@@ -413,7 +420,7 @@ def _forward(
         tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states)

         # generate initial buffer of beam_size prefixes-hypotheses
-        log_probs, decoder_mems_list, _ = self._one_step_forward(
+        log_probs, decoder_mems_list, xatt_scores_list = self._one_step_forward(
             tgt, encoder_hidden_states, encoder_input_mask, None, 0
         )
         scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1)
@@ -434,6 +441,10 @@ def _forward(
         else:
             hidden_size = decoder_mems_list[0].size(2)

+        # repeat xattn scores
+        if xatt_scores_list is not None:
+            xatt_scores_list = [xatt_layer.repeat(self.beam_size, 1, 1, 1) for xatt_layer in xatt_scores_list]
+
         # pad_profile tracks finished hypotheses to generate only <pad> tokens
         # if <eos> or <pad> has been generated
         pad_profile = torch.zeros_like(scores).long()
@@ -449,7 +460,7 @@ def _forward(
             pad_mask = pad_profile.repeat(1, self.beam_size)

             # generate and score candidates for prefixes continuation
-            log_probs, decoder_mems_list, _ = self._one_step_forward(
+            log_probs, decoder_mems_list, next_xatt_scores_list = self._one_step_forward(
                 prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i
             )
             scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1)
@@ -478,6 +489,19 @@ def _forward(
             prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len)
             prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len)

+            # select xatt scores corresponding to chosen hypotheses
+            if next_xatt_scores_list is not None:
+                num_heads = xatt_scores_list[0].shape[1]
+                xatt_indices_i = indices_i.unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(
+                    1, 1, num_heads, p_len - 1, src_length) // self.beam_size
+                for layer in range(len(next_xatt_scores_list)):
+                    xatt_layer_score_i = torch.cat((xatt_scores_list[layer], next_xatt_scores_list[layer]), dim=2)
+                    xatt_scores_list[layer] = xatt_layer_score_i.view(
+                        -1, self.beam_size, num_heads, p_len - 1, src_length
+                    ).gather(1, xatt_indices_i).view(
+                        -1, num_heads, p_len - 1, src_length
+                    )
+
             # reshuffle cached decoder memory states to restore the order
             # of hypotheses broken after top-k selection
             mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size
@@ -501,13 +525,22 @@ def _forward(
         # select best performing hypotheses in each element of the batch
         len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen)
         scores = scores / len_penalties
-        best_guesses = (
-            torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1)
-        )
-        tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1)
+        best_guesses = torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True)
+        tgt_best_guesses = best_guesses.repeat(1, prefixes.size(1)).unsqueeze(1)
+        tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, tgt_best_guesses).squeeze(1)
+
+        # select xatt scores for best hypotheses
+        if xatt_scores_list is not None:
+            _, num_heads, tgt_len, src_len = xatt_scores_list[0].shape
+            xatt_best_guesses = best_guesses.unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(
+                1, 1, num_heads, tgt_len, src_len)
+            for layer in range(len(xatt_scores_list)):
+                xatt_scores_list[layer] = xatt_scores_list[layer].view(
+                    -1, self.beam_size, num_heads, tgt_len, src_len
+                ).gather(1, xatt_best_guesses).squeeze(1)

         if return_beam_scores:
-            return prefixes, scores * len_penalties, tgt
+            return prefixes, scores * len_penalties, tgt, xatt_scores_list
         else:
             return tgt

@@ -549,7 +582,7 @@ def _forward(
         batch_fusion_states_candidates_list = []

         # generate initial buffer of beam_size prefixes-hypotheses
-        log_probs, decoder_mems_list, _ = self._one_step_forward(
+        log_probs, decoder_mems_list, xatt_scores_list = self._one_step_forward(
             tgt, encoder_hidden_states, encoder_input_mask, None, 0
         )
         # get fusion models scores
@@ -585,6 +618,10 @@ def _forward(
         else:
             hidden_size = decoder_mems_list[0].size(2)

+        # repeat xattn scores
+        if xatt_scores_list is not None:
+            xatt_scores_list = [xatt_layer.repeat(self.beam_size, 1, 1, 1) for xatt_layer in xatt_scores_list]
+
         # pad_profile tracks finished hypotheses to generate only <pad> tokens
         # if <eos> or <pad> has been generated
         pad_profile = torch.zeros_like(scores).long()
@@ -600,7 +637,7 @@ def _forward(
             pad_mask = pad_profile.repeat(1, self.beam_size)

             # generate and score candidates for prefixes continuation
-            log_probs, decoder_mems_list, _ = self._one_step_forward(
+            log_probs, decoder_mems_list, next_xatt_scores_list = self._one_step_forward(
                 prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i
            )
             for fusion_model_idx, fusion_model in enumerate(self.fusion_models):
@@ -647,6 +684,19 @@ def _forward(
             prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len)
             prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len)

+            # select xatt scores corresponding to chosen hypotheses
+            if next_xatt_scores_list is not None:
+                num_heads = xatt_scores_list[0].shape[1]
+                xatt_indices_i = indices_i.unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(
+                    1, 1, num_heads, p_len - 1, src_length) // self.beam_size
+                for layer in range(len(next_xatt_scores_list)):
+                    xatt_layer_score_i = torch.cat((xatt_scores_list[layer], next_xatt_scores_list[layer]), dim=2)
+                    xatt_scores_list[layer] = xatt_layer_score_i.view(
+                        -1, self.beam_size, num_heads, p_len - 1, src_length
+                    ).gather(1, xatt_indices_i).view(
+                        -1, num_heads, p_len - 1, src_length
+                    )
+
             # reshuffle cached decoder memory states to restore the order
             # of hypotheses broken after top-k selection
             mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size
@@ -670,13 +720,22 @@ def _forward(
         # select best performing hypotheses in each element of the batch
         len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen)
         scores = scores / len_penalties
-        best_guesses = (
-            torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1)
-        )
-        tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1)
+        best_guesses = torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True)
+        tgt_best_guesses = best_guesses.repeat(1, prefixes.size(1)).unsqueeze(1)
+        tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, tgt_best_guesses).squeeze(1)
+
+        # select xatt scores for best hypotheses
+        if xatt_scores_list is not None:
+            _, num_heads, tgt_len, src_len = xatt_scores_list[0].shape
+            xatt_best_guesses = best_guesses.unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(
+                1, 1, num_heads, tgt_len, src_len)
+            for layer in range(len(xatt_scores_list)):
+                xatt_scores_list[layer] = xatt_scores_list[layer].view(
+                    -1, self.beam_size, num_heads, tgt_len, src_len
+                ).gather(1, xatt_best_guesses).squeeze(1)

         if return_beam_scores:
-            return prefixes, scores * len_penalties, tgt
+            return prefixes, scores * len_penalties, tgt, xatt_scores_list
         else:
             return tgt
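The trickiest bookkeeping in this diff is the beam reshuffle: after every top-k step, a surviving hypothesis must inherit the cross-attention history of the parent beam it was expanded from. Below is a minimal, self-contained sketch of that view/gather/view pattern with toy shapes and a hypothetical parents index tensor; it mirrors the logic above (the diff additionally maps candidate indices to parent beams with // self.beam_size) but is not the NeMo code itself.

import torch

# Shapes follow the diff: [batch * beam, heads, tgt_len, src_len].
batch, beam, heads, tgt_len, src_len = 2, 3, 4, 5, 6
xatt = torch.randn(batch * beam, heads, tgt_len, src_len)

# Hypothetical parent indices: parents[b, k] is the beam that the new
# hypothesis k in batch element b descends from (0 <= parents < beam).
parents = torch.randint(0, beam, (batch, beam))

# Broadcast the parent index over heads/tgt/src, gather along the beam
# dimension, then flatten back, analogous to the per-layer update above.
idx = parents.unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, 1, heads, tgt_len, src_len)
reordered = (
    xatt.view(batch, beam, heads, tgt_len, src_len)
    .gather(1, idx)
    .view(-1, heads, tgt_len, src_len)
)
# The first hypothesis of batch 0 now carries its parent's attention map.
assert torch.equal(reordered[0], xatt[parents[0, 0]])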

nemo/collections/asr/parts/submodules/multitask_beam_decoding.py

Lines changed: 11 additions & 3 deletions
@@ -33,7 +33,10 @@


 def pack_hypotheses(
-    hypotheses: List[Hypothesis], beam_hypotheses: torch.Tensor, scores: List[Optional[float]]
+    hypotheses: List[Hypothesis],
+    beam_hypotheses: torch.Tensor,
+    scores: List[Optional[float]],
+    xatt_scores_list: List[torch.Tensor] = None
 ) -> List[Hypothesis]:

     for idx, hyp in enumerate(hypotheses):  # type: Hypothesis
@@ -49,6 +52,9 @@ def pack_hypotheses(
         if hyp.dec_state is not None:
             hyp.dec_state = _states_to_device(hyp.dec_state)

+        if xatt_scores_list is not None:
+            hyp.xatt_scores = [xatt_layer[idx] for xatt_layer in xatt_scores_list]
+
     return hypotheses


@@ -231,7 +237,7 @@ def forward(
         self.transformer_decoder.eval()
         self.log_softmax_module.eval()

-        topk_hypotheses, beam_scores, best_hypo = self.beam_search(
+        topk_hypotheses, beam_scores, best_hypo, xatt_scores_list = self.beam_search(
             encoder_hidden_states=encoder_hidden_states,
             encoder_input_mask=encoder_input_mask,
             decoder_input_ids=decoder_input_ids,
@@ -251,11 +257,13 @@ def forward(
         else:
             beam_scores = [None for _ in range(len(best_hypo))]
             best_hypo = best_hypo.detach().cpu()
+            if xatt_scores_list is not None:
+                xatt_scores_list = [xatt_layer.detach().cpu() for xatt_layer in xatt_scores_list]
             hypotheses = [
                 Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0])
             ]
             # Pack results into Hypotheses
-            packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores)
+            packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores, xatt_scores_list)
         self.format_hypotheses(packed_result, decoder_input_ids)

         self.transformer_decoder.train()
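For reference, the attachment in pack_hypotheses amounts to slicing each batched per-layer tensor by hypothesis index. A self-contained sketch, assuming layer tensors batched as [batch, heads, U, T] and using a stand-in Hyp dataclass rather than NeMo's Hypothesis:

from dataclasses import dataclass
from typing import List, Optional

import torch

@dataclass
class Hyp:  # stand-in for NeMo's Hypothesis
    xatt_scores: Optional[List[torch.Tensor]] = None

num_layers, batch, heads, U, T = 2, 3, 4, 5, 6
xatt_scores_list = [torch.randn(batch, heads, U, T) for _ in range(num_layers)]

hyps = [Hyp() for _ in range(batch)]
for idx, hyp in enumerate(hyps):
    # hypothesis idx receives one [heads, U, T] map per decoder layer
    hyp.xatt_scores = [xatt_layer[idx] for xatt_layer in xatt_scores_list]

assert hyps[1].xatt_scores[0].shape == (heads, U, T)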

nemo/collections/asr/parts/submodules/multitask_greedy_decoding.py

Lines changed: 18 additions & 3 deletions
@@ -33,6 +33,7 @@ def pack_hypotheses(
     beam_hypotheses: torch.Tensor,
     scores: List[Optional[float]],
     step_confidence: Optional[torch.Tensor] = None,
+    xatt_scores: Optional[List[torch.Tensor]] = None,
 ) -> List[Hypothesis]:

     for idx, hyp in enumerate(hypotheses):  # type: Hypothesis
@@ -52,6 +53,9 @@ def pack_hypotheses(
         if hyp.dec_state is not None:
             hyp.dec_state = _states_to_device(hyp.dec_state)

+        if xatt_scores is not None:
+            hyp.xatt_scores = [xatt_layer[idx] for xatt_layer in xatt_scores]
+
     return hypotheses


@@ -192,7 +196,7 @@ def forward(
         self.transformer_decoder.eval()
         self.log_softmax_module.eval()

-        best_hypo, topk_hypotheses, step_confidence = self.greedy_search(
+        best_hypo, topk_hypotheses, step_confidence, xatt_scores_list = self.greedy_search(
             encoder_hidden_states=encoder_hidden_states,
             encoder_input_mask=encoder_input_mask,
             decoder_input_ids=decoder_input_ids,
@@ -202,23 +206,32 @@ def forward(
             topk_hypotheses = [x.detach().cpu() for x in topk_hypotheses]  # each item is [beam, seq_len]
             beam_scores = [[None] * self.n_samples for _ in topk_hypotheses]  # each item is [beam,]
             packed_result = []
+            if xatt_scores_list is not None:
+                xatt_scores_list = [
+                    xatt_layer.view(len(topk_hypotheses), -1, *xatt_layer.shape[1:]).detach().cpu()
+                    for xatt_layer in xatt_scores_list]
             for i in range(len(topk_hypotheses)):
                 # Pack results into Hypotheses
                 hypotheses = [Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(self.n_samples)]
                 self.format_hypotheses(hypotheses, decoder_input_ids)
+                topk_xatt_scores = None
+                if xatt_scores_list is not None:
+                    topk_xatt_scores = [xatt_layer[i] for xatt_layer in xatt_scores_list]
                 packed_result.append(
                     NBestHypotheses(
-                        pack_hypotheses(hypotheses, topk_hypotheses[i], beam_scores[i]), step_confidence
+                        pack_hypotheses(
+                            hypotheses, topk_hypotheses[i], beam_scores[i], step_confidence, topk_xatt_scores)
                     )
                 )
         else:
             beam_scores = [None for _ in range(len(best_hypo))]
             best_hypo = best_hypo.cpu()
+            xatt_scores_list = [xatt_scores_layer.detach().cpu() for xatt_scores_layer in xatt_scores_list]
             hypotheses = [
                 Hypothesis(score=0.0, y_sequence=[], timestamp=[]) for _ in range(encoder_hidden_states.shape[0])
             ]
             # Pack results into Hypotheses
-            packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores, step_confidence)
+            packed_result = pack_hypotheses(hypotheses, best_hypo, beam_scores, step_confidence, xatt_scores_list)
         self.format_hypotheses(packed_result, decoder_input_ids)

         self.transformer_decoder.train()
@@ -256,6 +269,8 @@ def format_hypotheses(self, packed_result: List[Hypothesis], decoder_input_ids:
             if pos < -1:
                 hyp.y_sequence = ids[: pos + 1]
                 hyp.token_confidence = hyp.token_confidence[: pos + 1] if hyp.token_confidence is not None else None
+                if hyp.xatt_scores is not None:
+                    hyp.xatt_scores = [xatt_layer[:, : pos + 1, :] for xatt_layer in hyp.xatt_scores]


 @dataclass
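The trimming added to format_hypotheses keeps the attention maps aligned with the shortened token sequence: when y_sequence is cut at ids[: pos + 1], the decoder-step axis (dim 1 of each [heads, U, T] layer tensor) is cut the same way. A small sketch with made-up shapes and a hypothetical pos offset:

import torch

heads, U, T = 4, 7, 9
xatt_scores = [torch.randn(heads, U, T) for _ in range(2)]  # two decoder layers

pos = -3  # hypothetical: two trailing <eos>/<pad> steps to drop
xatt_scores = [xatt_layer[:, : pos + 1, :] for xatt_layer in xatt_scores]

assert xatt_scores[0].shape == (heads, U + pos + 1, T)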

nemo/collections/asr/parts/utils/rnnt_utils.py

Lines changed: 4 additions & 0 deletions
@@ -87,6 +87,9 @@ class Hypothesis:
         last_token (Optional): A token or batch of tokens which was predicted in the last step.

         last_frame (Optional): Index of the last decoding step hypothesis was updated including blank token prediction.
+
+        xatt_scores (Optional): List of cross-attention scores for each decoder layer. Each element of the list
+            is a Tensor of shape num heads x decoder input len x encoder output len (HxUxT).
     """

     score: float
@@ -108,6 +111,7 @@ class Hypothesis:
     last_token: Optional[torch.Tensor] = None
     token_duration: Optional[torch.Tensor] = None
     last_frame: Optional[int] = None
+    xatt_scores: Optional[List[torch.Tensor]] = None

     @property
     def non_blank_frame_confidence(self) -> List[float]:
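One plausible downstream use of the new field (an assumption for illustration, not something this commit implements) is collapsing the per-layer maps into a single token-to-frame soft alignment, e.g. for timestamp estimation:

import torch

def soft_alignment(xatt_scores):
    """Hypothetical helper: average over layers and heads -> [U, T]."""
    stacked = torch.stack(xatt_scores)  # [num_layers, heads, U, T]
    return stacked.mean(dim=(0, 1))  # [U, T]

# alignment = soft_alignment(hyp.xatt_scores)
# frame_of_token = alignment.argmax(dim=-1)  # strongest encoder frame per token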
