Skip to content

Commit 1024749

Browse files
committed
utilise the Silero VAD model on PyTorch Hub
1 parent e147990 commit 1024749

File tree

4 files changed

+21
-14
lines changed

4 files changed

+21
-14
lines changed

pyproject.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ dependencies = [
8484
"tf-keras~=2.19.0; python_version >= '3.12'",
8585
"tensorflow>=1.15.5,<2.16.0; python_version < '3.12'",
8686
"tensorflow~=2.19.0; python_version >= '3.12'",
87+
"tensorflow-metal~=1.2.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
8788
"termcolor==1.1.0",
8889
"toml==0.10.0",
8990
"toolz==0.9.0",
@@ -106,7 +107,6 @@ harmony = [
106107
"accelerate~=1.12.0",
107108
"soxr==1.0.0",
108109
"webrtcvad~=2.0.10",
109-
"silero-vad~=6.2.0",
110110
]
111111
dev = [
112112
"aeneas~=1.7.3.0; python_version < '3.12'",
@@ -120,7 +120,6 @@ dev = [
120120
"soxr==1.0.0",
121121
"accelerate~=1.12.0",
122122
"webrtcvad~=2.0.10",
123-
"silero-vad~=6.2.0",
124123
"mock==4.0.3",
125124
"coverage==5.5",
126125
"tox~=3.23.0",
@@ -159,7 +158,6 @@ llm = [
159158
"accelerate~=1.12.0",
160159
"soxr==1.0.0",
161160
"webrtcvad~=2.0.10",
162-
"silero-vad~=6.2.0",
163161
]
164162

165163
[project.scripts]

subaligner/transcriber.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def __init__(self, recipe: str = TranscriptionRecipe.WHISPER.value, flavour: str
7373
self.__flavour = flavour
7474
self.__media_helper = MediaHelper()
7575
self.__lock = Lock()
76+
self.vad_model: Optional[Any] = None
7677

7778
if recipe == TranscriptionRecipe.WHISPER.value:
7879
if flavour not in [f.value for f in WhisperFlavour]:
@@ -143,7 +144,9 @@ def transcribe(self,
143144
self.__LOGGER.debug("Prompting with: '%s'" % initial_prompt)
144145

145146
audio, sr = self.__load_audio(audio_file_path, target_sample_rate=sample_rate)
146-
segments = Utils.vad_segment(audio, sample_rate=sr, recipe="silero")
147+
segments, self.vad_model = Utils.vad_segment(
148+
audio, sample_rate=sr, recipe="silero", model_local=self.vad_model
149+
)
147150
self.__LOGGER.info("Segments detected with voice activities")
148151

149152
final_segments = []

subaligner/utils.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,8 @@ def vad_segment(audio: np.ndarray,
778778
frame_ms: int = 30,
779779
aggressiveness: int = 3,
780780
min_speech_ms: int = 200,
781-
recipe: str = "webrtcvad") -> List[Tuple[int, int]]:
781+
recipe: str = "webrtcvad",
782+
model_local: Optional[Any] = None) -> Tuple[List[Tuple[int, int]], Any]:
782783
"""Segment audio into speech and non-speech segments using the selected VAD recipe (WebRTC VAD or Silero VAD).
783784
784785
Arguments:
@@ -788,9 +789,9 @@ def vad_segment(audio: np.ndarray,
788789
aggressiveness {int} -- The aggressiveness of the VAD (0-3).
789790
min_speech_ms {int} -- The minimum duration of a speech segment in milliseconds.
790791
recipe {str} -- The VAD recipe to use ("webrtcvad" or "silero").
791-
792+
model_local {Optional[Any]} -- A previously loaded VAD model to reuse; if None, the model is loaded on first use.
792793
Returns:
793-
List[Tuple[int, int]]: A list of tuples representing the start and end samples of speech segments.
794+
Tuple[List[Tuple[int, int]], Any]: A list of tuples representing the start and end samples of speech segments, and the loaded VAD model.
794795
795796
Raises:
796797
ValueError: If an unsupported VAD recipe is provided.
@@ -831,20 +832,25 @@ def vad_segment(audio: np.ndarray,
831832
if cur_start is not None and cur_end is not None:
832833
if (cur_end - cur_start) >= int(min_speech_ms * sample_rate / 1000):
833834
segments.append((cur_start, cur_end))
834-
return segments
835+
return segments, model_local
835836
elif recipe == "silero":
836-
from silero_vad import load_silero_vad, get_speech_timestamps
837-
model = load_silero_vad()
837+
if model_local is None:
838+
model_local, utils = torch.hub.load(
839+
repo_or_dir="snakers4/silero-vad:be95df9152c0d7618fa1edfeb296fc3dae32376f", # v6.2
840+
model="silero_vad",
841+
force_reload=False,
842+
)
843+
(get_speech_timestamps, _, read_audio, *_) = utils
838844
speech_timestamps = get_speech_timestamps(
839845
torch.tensor(audio, dtype=torch.float32),
840-
model,
846+
model_local,
841847
sampling_rate=sample_rate,
842848
return_seconds=True,
843849
)
844850
segments = []
845851
for ts in speech_timestamps:
846852
segments.append((int(ts['start'] * sample_rate), int(ts['end'] * sample_rate)))
847-
return segments
853+
return segments, model_local
848854
else:
849855
raise ValueError("Unsupported VAD recipe: {}".format(recipe))
850856

tests/subaligner/test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ def test_vad_segment_webrtcvad(self):
427427
audio = audio.astype("float32") / maxv
428428
audio = audio.astype("float32")
429429

430-
segments = Undertest.vad_segment(
430+
segments, _ = Undertest.vad_segment(
431431
audio, sample_rate=sr, frame_ms=30, aggressiveness=2, min_speech_ms=300, recipe="webrtcvad"
432432
)
433433

@@ -449,7 +449,7 @@ def test_vad_segment_silero(self):
449449
audio = audio.astype("float32") / maxv
450450
audio = audio.astype("float32")
451451

452-
segments = Undertest.vad_segment(audio, sample_rate=sr, recipe="silero")
452+
segments, _ = Undertest.vad_segment(audio, sample_rate=sr, recipe="silero")
453453

454454
self.assertGreater(len(segments), 0)
455455
for start, end in segments:

0 commit comments

Comments
 (0)