Skip to content

Commit c6bfb27

Browse files
authored
Fix word confidence return (#15249)
1 parent 5487c7e commit c6bfb27

File tree

2 files changed

+43
-2
lines changed

2 files changed

+43
-2
lines changed

nemo/collections/asr/parts/utils/asr_confidence_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from omegaconf import DictConfig, OmegaConf
2323

2424
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
25-
from nemo.utils import logging
2625

2726

2827
class ConfidenceMethodConstants:
@@ -447,7 +446,7 @@ def _aggregate_token_confidence_subwords_sentencepiece(
447446
prev_underline = False
448447
for i, token_id in enumerate(token_ids):
449448
token = self.decode_ids_to_tokens([int(token_id)])[0]
450-
token_text = self.decode_tokens_to_str([int(token_id)])
449+
token_text = self.decode_ids_to_str([int(token_id)])
451450
# treat `<unk>` as a separate word regardless of the next token
452451
# to match the result of `tokenizer.ids_to_text`
453452
if (token != token_text or prev_unk) and i > j:

nemo/collections/asr/parts/utils/chunking_utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def merge_parallel_chunks(hypotheses, encoded_len, model, timestamps, subsamplin
7070
timestamp=([] if not timestamps else {'word': [], 'segment': []}),
7171
)
7272
merged_hypotheses = join_y_sequence(merged_hypotheses, hypotheses)
73+
merged_hypotheses = join_confidence_values(merged_hypotheses, hypotheses)
7374
merged_hypotheses.text = final_text
7475
# Merge timestamps and add word and segment level timestamps
7576
if timestamps:
@@ -99,6 +100,44 @@ def join_y_sequence(merged_hypothesis, hypotheses):
99100
return merged_hypothesis
100101

101102

def join_confidence_values(merged_hypothesis, hypotheses):
    """
    Concatenate confidence values from multiple hypotheses into a single sequence.

    Merges the ``frame_confidence``, ``token_confidence`` and ``word_confidence``
    attributes. For each attribute, hypotheses whose value is None are skipped.
    Tensor values are concatenated with ``torch.cat``; list values are flattened
    into one list. If no hypothesis carries a value for an attribute, that
    attribute on ``merged_hypothesis`` is left untouched.

    Args:
        merged_hypothesis: Target hypothesis to update with concatenated confidence
        hypotheses: List of hypotheses containing confidence values

    Returns:
        Hypothesis: Updated merged_hypothesis with concatenated confidence values
    """
    # The three confidence attributes are merged with identical logic, so
    # handle them in one loop instead of three copy-pasted branches.
    for attr in ('frame_confidence', 'token_confidence', 'word_confidence'):
        values = [getattr(h, attr) for h in hypotheses if getattr(h, attr) is not None]
        if not values:
            # Nothing to merge for this attribute.
            continue
        if isinstance(values[0], torch.Tensor):
            setattr(merged_hypothesis, attr, torch.cat(values))
        elif isinstance(values[0], list):
            # Flatten the per-hypothesis lists into a single flat list.
            setattr(merged_hypothesis, attr, [c for conf_list in values for c in conf_list])
        # Any other container type is left unset, matching the original behavior.

    return merged_hypothesis
102141
def join_timestamp_and_add_word_and_segment_level_timestamps(
103142
merged_hypotheses, hypotheses, chunk_offsets, subsampling_factor, window_stride, decoding, merged_tokens=None
104143
):
@@ -307,6 +346,9 @@ def merge_hypotheses_of_same_audio(hypotheses_list, timestamps, subsampling_fact
307346

308347
merged_hypothesis.y_sequence = torch.cat([h.y_sequence for h in hypotheses_list])
309348

349+
# Merge confidence values from all hypotheses
350+
merged_hypothesis = join_confidence_values(merged_hypothesis, hypotheses_list)
351+
310352
# Create final text by joining text from all hypotheses
311353
text_parts = []
312354
for hyp in hypotheses_list:

0 commit comments

Comments
 (0)