Skip to content

Commit 046cc5c

Browse files
committed
EOU algorithm improvements
Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
1 parent 0652fbe commit 046cc5c

File tree

1 file changed

+121
-28
lines changed

1 file changed

+121
-28
lines changed

nemo/collections/tts/metrics/eou_classifier.py

Lines changed: 121 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
print(result.eou_type, result.trailing_duration)
1616
"""
1717

18-
from dataclasses import dataclass
18+
from dataclasses import dataclass, field
1919
from enum import StrEnum
2020
from typing import Union
2121

@@ -35,6 +35,15 @@ class EoUType(StrEnum):
3535
NOISE = "noise" # significant trailing region with high energy relative to speech
3636

3737

38+
@dataclass
class TokenSegment:
    """One contiguous run of a single non-blank CTC token in the forced alignment.

    Times are derived from frame indices multiplied by the frame duration,
    so ``duration == end - start`` up to floating-point rounding.
    """

    token: str  # decoded vocabulary token (e.g. a character/phoneme symbol)
    start: float  # segment start time in seconds
    end: float  # segment end time in seconds
    duration: float  # segment length in seconds (end - start)
    confidence: float  # mean per-frame posterior probability over the segment
45+
46+
3847
@dataclass
3948
class EoUClassification:
4049
eou_type: EoUType
@@ -44,6 +53,10 @@ class EoUClassification:
4453
trail_rms_ratio: float
4554
last_token_duration: float
4655
last_token_confidence: float
56+
last_token: str
57+
last_token_gap: float # blank gap (seconds) between last and second-to-last speech token
58+
last_two_phoneme_avg_confidence: float # average confidence of last two alphanumeric tokens
59+
token_segments: list[TokenSegment] = field(default_factory=list)
4760

4861

4962
class EoUClassifier:
@@ -61,6 +74,7 @@ def __init__(self, model_name: str = "facebook/wav2vec2-base-960h", sr: int = SR
6174
self.model.eval()
6275
self.blank_id = self.processor.tokenizer.pad_token_id
6376
self.vocab = self.processor.tokenizer.get_vocab()
77+
self.id_to_token = {v: k for k, v in self.vocab.items()}
6478

6579
def _text_to_tokens(self, text: str) -> list[int]:
6680
text = text.upper().strip()
@@ -93,32 +107,90 @@ def _find_speech_end(self, audio: np.ndarray, text: str) -> dict:
93107
aligned_ids = fa_ids[0].numpy()
94108
scores = torch.exp(fa_scores[0]).numpy()
95109

96-
speech_end_frame = 0
97-
last_token_start = 0
98-
last_token_id = -1
99-
for i in range(len(aligned_ids) - 1, -1, -1):
100-
if aligned_ids[i] != self.blank_id:
101-
if last_token_id == -1:
102-
speech_end_frame = i + 1
103-
last_token_id = int(aligned_ids[i])
104-
last_token_start = i
105-
elif int(aligned_ids[i]) != last_token_id:
106-
break
107-
else:
108-
last_token_start = i
109-
110-
if last_token_id == -1:
110+
segments: list[TokenSegment] = []
111+
cur_id = -1
112+
seg_start = 0
113+
for i, aid in enumerate(aligned_ids):
114+
tid = int(aid)
115+
if tid == self.blank_id:
116+
if cur_id != -1:
117+
seg_scores = scores[seg_start:i]
118+
segments.append(
119+
TokenSegment(
120+
token=self.id_to_token.get(cur_id, f"<id:{cur_id}>"),
121+
start=seg_start * frame_duration,
122+
end=i * frame_duration,
123+
duration=(i - seg_start) * frame_duration,
124+
confidence=float(seg_scores.mean()),
125+
)
126+
)
127+
cur_id = -1
128+
elif tid != cur_id:
129+
if cur_id != -1:
130+
seg_scores = scores[seg_start:i]
131+
segments.append(
132+
TokenSegment(
133+
token=self.id_to_token.get(cur_id, f"<id:{cur_id}>"),
134+
start=seg_start * frame_duration,
135+
end=i * frame_duration,
136+
duration=(i - seg_start) * frame_duration,
137+
confidence=float(seg_scores.mean()),
138+
)
139+
)
140+
cur_id = tid
141+
seg_start = i
142+
# else: same non-blank token continues
143+
if cur_id != -1:
144+
seg_scores = scores[seg_start : len(aligned_ids)]
145+
segments.append(
146+
TokenSegment(
147+
token=self.id_to_token.get(cur_id, f"<id:{cur_id}>"),
148+
start=seg_start * frame_duration,
149+
end=len(aligned_ids) * frame_duration,
150+
duration=(len(aligned_ids) - seg_start) * frame_duration,
151+
confidence=float(seg_scores.mean()),
152+
)
153+
)
154+
155+
if not segments:
111156
return {
112157
"speech_end": 0.0,
113158
"last_token_duration": 0.0,
114159
"last_token_confidence": 0.0,
160+
"last_token": "",
161+
"last_token_gap": 0.0,
162+
"last_two_phoneme_avg_confidence": 0.0,
163+
"token_segments": [],
115164
}
116165

117-
last_seg_scores = scores[last_token_start:speech_end_frame]
166+
last = segments[-1]
167+
168+
# Skip trailing punctuation/non-letter tokens for cutoff analysis,
169+
# since they don't correspond to real speech sounds and get
170+
# unreliably short durations from forced alignment.
171+
last_speech = last
172+
for seg in reversed(segments):
173+
if seg.token.isalnum():
174+
last_speech = seg
175+
break
176+
177+
last_idx = segments.index(last_speech)
178+
if last_idx > 0:
179+
last_token_gap = last_speech.start - segments[last_idx - 1].end
180+
else:
181+
last_token_gap = last_speech.start
182+
183+
last_two_alnum = [s for s in segments if s.token.isalnum()][-2:]
184+
last_two_avg = float(np.mean([s.confidence for s in last_two_alnum]))
185+
118186
return {
119-
"speech_end": speech_end_frame * frame_duration,
120-
"last_token_duration": (speech_end_frame - last_token_start) * frame_duration,
121-
"last_token_confidence": float(last_seg_scores.mean()),
187+
"speech_end": last.end, # + 0.05, # add 50ms of tolerance
188+
"last_token_duration": last_speech.duration,
189+
"last_token_confidence": last_speech.confidence,
190+
"last_token": last_speech.token,
191+
"last_token_gap": last_token_gap,
192+
"last_two_phoneme_avg_confidence": last_two_avg,
193+
"token_segments": segments,
122194
}
123195

124196
def classify(
@@ -146,8 +218,8 @@ def classify(
146218

147219
speech_end = info["speech_end"]
148220
trailing = audio_dur - speech_end
149-
150-
trail_start = int(speech_end * self.sr)
221+
last_letter_pad = 0.15
222+
trail_start = int((speech_end + last_letter_pad) * self.sr)
151223
trailing_audio = samples[trail_start:]
152224
if len(trailing_audio) > 0:
153225
rms_trail = np.sqrt(np.mean(trailing_audio**2))
@@ -158,15 +230,26 @@ def classify(
158230

159231
last_dur = info["last_token_duration"]
160232
last_conf = info["last_token_confidence"]
161-
162-
if trailing < 0.06 and last_dur < 0.025 and last_conf < 0.1:
163-
# speech ends abruptly, with a very short last token and low confidence --> cutoff
233+
if last_conf < 0.01:
234+
last_conf = info["last_two_phoneme_avg_confidence"]
235+
last_tok = info["last_token"]
236+
last_gap = info["last_token_gap"]
237+
last_two_avg = info["last_two_phoneme_avg_confidence"]
238+
token_segments = info["token_segments"]
239+
240+
# if trailing < 0.06 and (last_dur < 0.025 and last_conf < 0.15):
241+
# if trailing < 0.06 and (last_gap < 0.1 and last_conf < 0.15): short trail and not due to gap
242+
conf_threshold = 0.07
243+
# short tail with low confidence and not due to gap (which could indicate noise) --> cutoff
244+
if trailing < 0.1 and last_conf < conf_threshold and not last_gap > 0.4:
245+
# speech ends abruptly, with a very short last token or low confidence --> cutoff
164246
eou_type = EoUType.CUTOFF
165-
elif trailing > 0.3 and trail_rms_ratio > 0.5:
247+
# long noisy tail OR odd gap --> noisy
248+
elif (trailing > 0.25 and trail_rms_ratio > 0.3) or (last_gap > 0.4 and last_conf < conf_threshold):
166249
# significant trailing region with high energy relative to speech --> noise
167250
eou_type = EoUType.NOISE
168-
elif trailing > 1.0 and trail_rms_ratio < 0.10:
169-
# very long trailing region with near-zero energy --> silence
251+
# very long trailing region with near-zero energy --> silence
252+
elif trailing > 1.0: # and trail_rms_ratio < 0.10:
170253
eou_type = EoUType.SILENCE
171254
else:
172255
# everything else (moderate trailing, natural energy decay) --> good
@@ -180,6 +263,10 @@ def classify(
180263
trail_rms_ratio=trail_rms_ratio,
181264
last_token_duration=last_dur,
182265
last_token_confidence=last_conf,
266+
last_token=last_tok,
267+
last_token_gap=last_gap,
268+
last_two_phoneme_avg_confidence=last_two_avg,
269+
token_segments=token_segments,
183270
)
184271

185272

@@ -200,3 +287,9 @@ def classify(
200287
print(f"trail_rms_ratio: {result.trail_rms_ratio:.4f}")
201288
print(f"last_token_dur: {result.last_token_duration:.3f}s")
202289
print(f"last_token_conf: {result.last_token_confidence:.3f}")
290+
print(f"last_token_gap: {result.last_token_gap:.3f}s")
291+
print(f"last_2_ph_avg_conf: {result.last_two_phoneme_avg_confidence:.3f}")
292+
print(f"last_token: {result.last_token!r}")
293+
print(f"\nToken segments ({len(result.token_segments)}):")
294+
for seg in result.token_segments:
295+
print(f" {seg.token!r:<6} {seg.start:.3f}-{seg.end:.3f}s dur={seg.duration:.3f}s conf={seg.confidence:.3f}")

0 commit comments

Comments
 (0)