Add unit test for EOU classifier

rfejgin · rfejgin · commit 31454fcf72df · 2026-03-03T18:00:45.000-08:00
Signed-off-by: Fejgin, Roy <rfejgin@nvidia.com>
diff --git a/tests/collections/tts/metrics/test_eou_classifier.py b/tests/collections/tts/metrics/test_eou_classifier.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import librosa
+import numpy as np
+import pytest
+
+from nemo.collections.tts.metrics.eou_classifier import (
+    EoUClassification,
+    EoUClassifier,
+    EoUType,
+    TokenSegment,
+    _ends_with_sibilant,
+)
+
+# ---------------------------------------------------------------------------
+# Each entry is (expected EoU type, audio_path, text); add further pairs per EoU class here.
+# Paths are absolute (built from DATA_PATH below). Multiple examples per class are supported.
+# ---------------------------------------------------------------------------
+DATA_PATH = "/home/TestData/tts/eou_classifier_unit_test"  # absolute directory holding the test .wav files
+_CLASSIFICATION_CASES: list[tuple[EoUType, str, str]] = [  # (expected EoU type, audio path, text)
+    (EoUType.GOOD, f"{DATA_PATH}/rodney.wav", "Yes, it is quite amazing to watch and I love all of it."),
+    (
+        EoUType.CUTOFF,
+        f"{DATA_PATH}/libritts_test_clean_1320_122612_000056_000003.wav",
+        "Having reached within a few yards of the latter, he arose to his feet, silently and slowly.",
+    ),
+    (EoUType.SILENCE, f"{DATA_PATH}/magpie_silence_wood.wav", "w o o d"),
+    (EoUType.NOISE, f"{DATA_PATH}/magpie_noisy_yes.wav", "yes"),
+    # This sample starts looping the text at the end, so it should be detected as noise.
+    (
+        EoUType.NOISE,
+        f"{DATA_PATH}/magpie_repeated_tail.wav",
+        "Put them away quick before Andella and Rosalie see them.",
+    ),
+]
+
+
+@pytest.fixture(scope="module")
+def classifier():
+    """Create one EoUClassifier per test module so its model (Wav2Vec2, per the original author) loads once."""
+    return EoUClassifier()
+
+
+# ── classification tests (one per class) ──────────────────────────────────
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize(
+    ("eou_type", "audio_path", "text"), _CLASSIFICATION_CASES, ids=[case[1] for case in _CLASSIFICATION_CASES]
+)
+def test_classification_matches_expected_class(classifier, eou_type, audio_path, text):
+    """Classify one labelled sample and require the predicted EoU type to equal the label."""
+    result = classifier.classify(audio_path, text)
+    assert isinstance(result, EoUClassification)
+    # On mismatch, report the result's trailing duration, RMS ratio and last-token confidence.
+    assert result.eou_type == eou_type, (
+        f"Expected {eou_type.value!r} but got {result.eou_type.value!r} "
+        f"(trailing={result.trailing_duration:.3f}s, rms_ratio={result.trail_rms_ratio:.4f}, "
+        f"last_conf={result.last_token_confidence:.3f})"
+    )
+
+
+# ── numpy array input ─────────────────────────────────────────────────────
+
+
+@pytest.mark.unit
+def test_classify_accepts_numpy_array(classifier):
+    """A waveform already loaded into a numpy array must classify the same as its file path."""
+    good_case = next(case for case in _CLASSIFICATION_CASES if case[0] == EoUType.GOOD)
+    wav_path, transcript = good_case[1], good_case[2]
+    waveform = librosa.load(wav_path, sr=16000)[0]  # keep only the samples, loaded at 16 kHz
+
+    from_path = classifier.classify(wav_path, transcript)
+    from_array = classifier.classify(waveform, transcript)
+    assert from_path.eou_type == from_array.eou_type
+    assert abs(from_path.trailing_duration - from_array.trailing_duration) < 1e-4
+
+
+# ── return value structure ────────────────────────────────────────────────
+
+
+@pytest.mark.unit
+def test_classification_result_structure(classifier):
+    """Check field types and plausible value ranges of the result returned for a GOOD sample."""
+    good_case = next(case for case in _CLASSIFICATION_CASES if case[0] == EoUType.GOOD)
+    res = classifier.classify(good_case[1], good_case[2])
+
+    assert isinstance(res.eou_type, EoUType)
+    assert 0.0 <= res.speech_end
+    assert 0.0 < res.audio_duration
+    assert 0.0 <= res.trailing_duration
+    assert res.audio_duration + 0.5 >= res.speech_end  # small tolerance for frame rounding
+    assert 0.0 <= res.trail_rms_ratio
+    assert 0.0 <= res.last_token_duration
+    assert 0.0 <= res.last_token_confidence <= 1.0
+    assert isinstance(res.last_token, str)
+    assert 0.0 <= res.last_token_gap
+    assert 0.0 <= res.last_two_phoneme_avg_confidence <= 1.0
+
+    assert isinstance(res.token_segments, list)
+    assert len(res.token_segments) >= 1
+    for segment in res.token_segments:
+        assert isinstance(segment, TokenSegment)
+        assert segment.start <= segment.end
+        assert 0.0 <= segment.duration
+        assert 0.0 <= segment.confidence <= 1.0