Skip to content

Commit 079c6da

Browse files
authored
Replace microVAD with Silero VAD (ggml) (home-assistant#158282)
1 parent b120ae8 commit 079c6da

File tree

7 files changed

+50
-21
lines changed

7 files changed

+50
-21
lines changed

homeassistant/components/assist_pipeline/audio_enhancer.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
from abc import ABC, abstractmethod
44
from dataclasses import dataclass
55
import logging
6+
import math
67

7-
from pymicro_vad import MicroVad
8+
from pysilero_vad import SileroVoiceActivityDetector
89
from pyspeex_noise import AudioProcessor
910

1011
from .const import BYTES_PER_CHUNK
@@ -42,8 +43,8 @@ def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
4243
"""Enhance chunk of PCM audio @ 16Khz with 16-bit mono samples."""
4344

4445

45-
class MicroVadSpeexEnhancer(AudioEnhancer):
46-
"""Audio enhancer that runs microVAD and speex."""
46+
class SileroVadSpeexEnhancer(AudioEnhancer):
47+
"""Audio enhancer that runs Silero VAD and speex."""
4748

4849
def __init__(
4950
self, auto_gain: int, noise_suppression: int, is_vad_enabled: bool
@@ -69,21 +70,49 @@ def __init__(
6970
self.noise_suppression,
7071
)
7172

72-
self.vad: MicroVad | None = None
73+
self.vad: SileroVoiceActivityDetector | None = None
74+
75+
# We get 10ms chunks but Silero works on 32ms chunks, so we have to
76+
# buffer audio. The previous speech probability is used until enough
77+
# audio has been buffered.
78+
self._vad_buffer: bytearray | None = None
79+
self._vad_buffer_chunks = 0
80+
self._vad_buffer_chunk_idx = 0
81+
self._last_speech_probability: float | None = None
7382

7483
if self.is_vad_enabled:
75-
self.vad = MicroVad()
76-
_LOGGER.debug("Initialized microVAD")
84+
self.vad = SileroVoiceActivityDetector()
85+
86+
# VAD buffer is a multiple of 10ms, but Silero VAD needs 32ms.
87+
self._vad_buffer_chunks = int(
88+
math.ceil(self.vad.chunk_bytes() / BYTES_PER_CHUNK)
89+
)
90+
self._vad_leftover_bytes = self.vad.chunk_bytes() - BYTES_PER_CHUNK
91+
self._vad_buffer = bytearray(self.vad.chunk_bytes())
92+
_LOGGER.debug("Initialized Silero VAD")
7793

7894
def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
7995
"""Enhance 10ms chunk of PCM audio @ 16Khz with 16-bit mono samples."""
80-
speech_probability: float | None = None
81-
8296
assert len(audio) == BYTES_PER_CHUNK
8397

8498
if self.vad is not None:
8599
# Run VAD
86-
speech_probability = self.vad.Process10ms(audio)
100+
assert self._vad_buffer is not None
101+
start_idx = self._vad_buffer_chunk_idx * BYTES_PER_CHUNK
102+
self._vad_buffer[start_idx : start_idx + BYTES_PER_CHUNK] = audio
103+
104+
self._vad_buffer_chunk_idx += 1
105+
if self._vad_buffer_chunk_idx >= self._vad_buffer_chunks:
106+
# We have enough data to run Silero VAD (32 ms)
107+
self._last_speech_probability = self.vad.process_chunk(
108+
self._vad_buffer[: self.vad.chunk_bytes()]
109+
)
110+
111+
# Copy leftover audio that wasn't processed to start
112+
self._vad_buffer[: self._vad_leftover_bytes] = self._vad_buffer[
113+
-self._vad_leftover_bytes :
114+
]
115+
self._vad_buffer_chunk_idx = 0
87116

88117
if self.audio_processor is not None:
89118
# Run noise suppression and auto gain
@@ -92,5 +121,5 @@ def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
92121
return EnhancedAudioChunk(
93122
audio=audio,
94123
timestamp_ms=timestamp_ms,
95-
speech_probability=speech_probability,
124+
speech_probability=self._last_speech_probability,
96125
)

homeassistant/components/assist_pipeline/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
"integration_type": "system",
99
"iot_class": "local_push",
1010
"quality_scale": "internal",
11-
"requirements": ["pymicro-vad==1.0.1", "pyspeex-noise==1.0.2"]
11+
"requirements": ["pysilero-vad==3.0.0", "pyspeex-noise==1.0.2"]
1212
}

homeassistant/components/assist_pipeline/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
from homeassistant.util.hass_dict import HassKey
5656
from homeassistant.util.limited_size_dict import LimitedSizeDict
5757

58-
from .audio_enhancer import AudioEnhancer, EnhancedAudioChunk, MicroVadSpeexEnhancer
58+
from .audio_enhancer import AudioEnhancer, EnhancedAudioChunk, SileroVadSpeexEnhancer
5959
from .const import (
6060
ACKNOWLEDGE_PATH,
6161
BYTES_PER_CHUNK,
@@ -633,7 +633,7 @@ def __post_init__(self) -> None:
633633
# Initialize with audio settings
634634
if self.audio_settings.needs_processor and (self.audio_enhancer is None):
635635
# Default audio enhancer
636-
self.audio_enhancer = MicroVadSpeexEnhancer(
636+
self.audio_enhancer = SileroVadSpeexEnhancer(
637637
self.audio_settings.auto_gain_dbfs,
638638
self.audio_settings.noise_suppression_level,
639639
self.audio_settings.is_vad_enabled,

homeassistant/package_constraints.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ Pillow==12.0.0
5353
propcache==0.4.1
5454
psutil-home-assistant==0.0.1
5555
PyJWT==2.10.1
56-
pymicro-vad==1.0.1
5756
PyNaCl==1.6.0
5857
pyOpenSSL==25.3.0
5958
pyserial==3.5
59+
pysilero-vad==3.0.0
6060
pyspeex-noise==1.0.2
6161
python-slugify==8.0.4
6262
PyTurboJPEG==1.8.0

requirements_all.txt

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

requirements_test_all.txt

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

script/hassfest/docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)