@@ -27,6 +27,10 @@ import (
2727 ort "github.com/yalue/onnxruntime_go"
2828)
2929
30+ // melStepSamples is the number of new audio samples (at 16 kHz) between
31+ // successive mel-spectrogram frames. OpenWakeWord uses a 10 ms step, i.e. 160 samples.
32+ const melStepSamples = 160
33+
3034// Global ONNX Runtime initialization (must only happen once)
3135var (
3236 ortInitOnce sync.Once
@@ -55,14 +59,13 @@ func newLightSessionOptions() (*ort.SessionOptions, error) {
5559}
5660
5761const (
58- TargetSampleRate = 16000
59- MelBins = 32
60- EmbeddingSize = 96
61- MelWindowSize = 76
62- MelWindowStep = 8
63- WakeWordFeatures = 16
64- DefaultCooldownMs = 2000
65- ProcessStepSamples = 3200 // ~200ms of new audio before re-running inference
62+ TargetSampleRate = 16000
63+ MelBins = 32
64+ EmbeddingSize = 96
65+ MelWindowSize = 76
66+ MelWindowStep = 8
67+ WakeWordFeatures = 16
68+ DefaultCooldownMs = 2000
6669)
6770
6871// ModelConfig represents a wake word model configuration
@@ -95,10 +98,10 @@ type Detector struct {
9598 wakeWordSessions map [string ]* wakeWordModel
9699 activeModelID string
97100
98- audioBuffer []int16
99- newSamplesSinceLast int // samples added since last processBuffer
100- lastDetectionTime time.Time
101- mu sync.Mutex
101+ audioBuffer []int16
102+ processedSamples int
103+ lastDetectionTime time.Time
104+ mu sync.Mutex
102105
103106 // Callbacks
104107 onDetected func (modelID string )
@@ -234,19 +237,25 @@ func (d *Detector) ProcessAudio(samples []int16) error {
234237
235238 // Add to buffer
236239 d .audioBuffer = append (d .audioBuffer , samples ... )
237- d .newSamplesSinceLast += len (samples )
238240
239241 // Keep max ~5 seconds of audio
240242 maxAudioLength := TargetSampleRate * 5
241243 if len (d .audioBuffer ) > maxAudioLength {
242- d .audioBuffer = d .audioBuffer [len (d .audioBuffer )- maxAudioLength :]
244+ excess := len (d .audioBuffer ) - maxAudioLength
245+ d .audioBuffer = d .audioBuffer [excess :]
246+ d .processedSamples -= excess
247+ if d .processedSamples < 0 {
248+ d .processedSamples = 0
249+ }
243250 }
244251
245- // Process when we have enough audio (at least 2 seconds) AND
246- // enough new samples since last processing to avoid running
247- // inference on every incoming frame.
252+ // Run inference whenever there are enough new samples to produce at
253+ // least one new mel-spectrogram frame (160 samples = 10 ms at 16 kHz).
254+ // This keeps detection latency as low as the model allows while still
255+ // avoiding redundant re-computation on identical audio.
248256 minSamples := TargetSampleRate * 2
249- if len (d .audioBuffer ) >= minSamples && d .newSamplesSinceLast >= ProcessStepSamples {
257+ newSamples := len (d .audioBuffer ) - d .processedSamples
258+ if len (d .audioBuffer ) >= minSamples && newSamples >= melStepSamples {
250259 return d .processBuffer ()
251260 }
252261
@@ -270,7 +279,7 @@ func (d *Detector) ProcessAudioFloat32(samples []float32) error {
270279}
271280
272281func (d * Detector ) processBuffer () error {
273- d .newSamplesSinceLast = 0
282+ d .processedSamples = len ( d . audioBuffer )
274283
275284 startTime := time .Now ()
276285
@@ -369,6 +378,7 @@ func (d *Detector) processBuffer() error {
369378 if len (d .audioBuffer ) > keepSamples {
370379 d .audioBuffer = d .audioBuffer [len (d .audioBuffer )- keepSamples :]
371380 }
381+ d .processedSamples = len (d .audioBuffer )
372382
373383 return nil
374384}
0 commit comments