Skip to content

Commit bb5b4af

Browse files
committed
fix: Wake word activation after changes
1 parent fa51615 commit bb5b4af

File tree

2 files changed

+31
-21
lines changed

2 files changed

+31
-21
lines changed

models/wakeword/wakewords.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ models:
1010
name: Oye Magec
1111
file: oye-magec.onnx
1212
phrase: Oye Magec
13-
threshold: 0.5
13+
threshold: 0.15
1414
annotations:
1515
language: es
1616
trained_with: openwakeword
@@ -23,7 +23,7 @@ models:
2323
name: Magec
2424
file: magec.onnx
2525
phrase: Magec
26-
threshold: 0.3
26+
threshold: 0.1
2727
annotations:
2828
language: es
2929
trained_with: openwakeword

server/voice/detector.go

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ import (
2727
ort "github.com/yalue/onnxruntime_go"
2828
)
2929

30+
// melStepSamples is the number of new audio samples (at 16 kHz) between successive
31+
// mel-spectrogram frames. OpenWakeWord uses a 10 ms step, i.e. 160 samples.
32+
const melStepSamples = 160
33+
3034
// Global ONNX Runtime initialization (must only happen once)
3135
var (
3236
ortInitOnce sync.Once
@@ -55,14 +59,13 @@ func newLightSessionOptions() (*ort.SessionOptions, error) {
5559
}
5660

5761
const (
58-
TargetSampleRate = 16000
59-
MelBins = 32
60-
EmbeddingSize = 96
61-
MelWindowSize = 76
62-
MelWindowStep = 8
63-
WakeWordFeatures = 16
64-
DefaultCooldownMs = 2000
65-
ProcessStepSamples = 3200 // ~200ms of new audio before re-running inference
62+
TargetSampleRate = 16000
63+
MelBins = 32
64+
EmbeddingSize = 96
65+
MelWindowSize = 76
66+
MelWindowStep = 8
67+
WakeWordFeatures = 16
68+
DefaultCooldownMs = 2000
6669
)
6770

6871
// ModelConfig represents a wake word model configuration
@@ -95,10 +98,10 @@ type Detector struct {
9598
wakeWordSessions map[string]*wakeWordModel
9699
activeModelID string
97100

98-
audioBuffer []int16
99-
newSamplesSinceLast int // samples added since last processBuffer
100-
lastDetectionTime time.Time
101-
mu sync.Mutex
101+
audioBuffer []int16
102+
processedSamples int
103+
lastDetectionTime time.Time
104+
mu sync.Mutex
102105

103106
// Callbacks
104107
onDetected func(modelID string)
@@ -234,19 +237,25 @@ func (d *Detector) ProcessAudio(samples []int16) error {
234237

235238
// Add to buffer
236239
d.audioBuffer = append(d.audioBuffer, samples...)
237-
d.newSamplesSinceLast += len(samples)
238240

239241
// Keep max ~5 seconds of audio
240242
maxAudioLength := TargetSampleRate * 5
241243
if len(d.audioBuffer) > maxAudioLength {
242-
d.audioBuffer = d.audioBuffer[len(d.audioBuffer)-maxAudioLength:]
244+
excess := len(d.audioBuffer) - maxAudioLength
245+
d.audioBuffer = d.audioBuffer[excess:]
246+
d.processedSamples -= excess
247+
if d.processedSamples < 0 {
248+
d.processedSamples = 0
249+
}
243250
}
244251

245-
// Process when we have enough audio (at least 2 seconds) AND
246-
// enough new samples since last processing to avoid running
247-
// inference on every incoming frame.
252+
// Once at least 2 seconds of audio are buffered, run inference whenever there
253+
// are enough new samples for one more mel-spectrogram frame (160 = 10 ms at 16 kHz).
254+
// This keeps detection latency as low as the model allows while still
255+
// avoiding redundant re-computation on identical audio.
248256
minSamples := TargetSampleRate * 2
249-
if len(d.audioBuffer) >= minSamples && d.newSamplesSinceLast >= ProcessStepSamples {
257+
newSamples := len(d.audioBuffer) - d.processedSamples
258+
if len(d.audioBuffer) >= minSamples && newSamples >= melStepSamples {
250259
return d.processBuffer()
251260
}
252261

@@ -270,7 +279,7 @@ func (d *Detector) ProcessAudioFloat32(samples []float32) error {
270279
}
271280

272281
func (d *Detector) processBuffer() error {
273-
d.newSamplesSinceLast = 0
282+
d.processedSamples = len(d.audioBuffer)
274283

275284
startTime := time.Now()
276285

@@ -369,6 +378,7 @@ func (d *Detector) processBuffer() error {
369378
if len(d.audioBuffer) > keepSamples {
370379
d.audioBuffer = d.audioBuffer[len(d.audioBuffer)-keepSamples:]
371380
}
381+
d.processedSamples = len(d.audioBuffer)
372382

373383
return nil
374384
}

0 commit comments

Comments
 (0)