Skip to content

Commit 79fafec

Browse files
committed
sepia vad updates
1 parent 6601060 commit 79fafec

File tree

4 files changed

+1139
-34
lines changed

4 files changed

+1139
-34
lines changed

src/modules/sepia-vad-worker.js

Lines changed: 123 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,16 @@ let _vadFrames; //each frame processes one chunk of '_vadBufferSize' as long
5454
let _vadFrameTimeMs; //real time (ms) of one vadFrame (defined by sample-rate and buffer size)
5555
let _vadBufferSize; //size of a single vadFrame (restrictions apply)
5656
let _vadBuffer;
57-
let _previousVadBuffer;
57+
//let _previousVadBuffer;
5858
let _transferFun;
5959

6060
//parameters to calculate vad
61-
let _movingAvgLoudness;
62-
let _maxLoudness = 0;
61+
let movingAvgLoudness;
62+
let maxLoudness = 0;
6363
let _movingAvgLoudnessWeight = 800; //TODO: make variable and normalize with sample-rate
64+
let _vadThreshold;
65+
let _warmUpFrames;
66+
let _totalFrames;
6467

6568
//sequence control
6669
let useSequenceAnalyzer = false;
@@ -69,11 +72,17 @@ let voiceResetTime;
6972
let silenceActivationTime;
7073
let maxSequenceTime;
7174
let minSequenceTime;
75+
let sequenceTimeForTrigger = 1100; //TODO: make variable
76+
let mfccSequenceStartBuffer;
77+
let loudnessSequenceStartBuffer;
78+
let feature1SequenceStartBuffer;
79+
let feature2SequenceStartBuffer;
80+
let feature3SequenceStartBuffer;
7281

7382
let _sequenceVoiceTime;
7483
let _sequenceSilenceTime;
7584
let _sequenceSawVoice, _sequenceSawSilenceAfterVoice, _sequenceFinishedVoice;
76-
let _sequenceIsActive, _sequenceIsDone, _sequenceStartedAt;
85+
let _sequenceIsActive, _sequenceIsDone, _sequenceStartedAt, _sequenceCheckedTrigger;
7786

7887
let _isFirstValidProcess;
7988

@@ -86,7 +95,7 @@ function init(){
8695
if (inputSampleRate < 8000 || inputSampleRate > 48000){
8796
throw JSON.stringify(new SampleRateException("For this module sample-rate has to be between 8000 and 48000 Hz."));
8897
}
89-
var allowedBufferSizes = [8192, 4096, 2048, 1024, 512, 256, 128]; //recommended: 10-30ms frame length, e.g. 256/16000 = 16ms
98+
var allowedBufferSizes = [8192, 4096, 2048, 1024, 512, 256, 128]; //recommended: 10-30ms frame length, e.g. 512/16000 = 32ms (recommended)
9099
_vadBufferSize = 0;
91100
for (let i=0; i<allowedBufferSizes.length; i++){
92101
if (processBufferSize == allowedBufferSizes[i] || processBufferSize % allowedBufferSizes[i] == 0){
@@ -107,7 +116,7 @@ function init(){
107116
var ringBufferSize = processBufferSize + inputSampleSize; //TODO: check size again
108117
_processRingBuffer = new RingBuffer(ringBufferSize, channelCount, "Float32");
109118
_vadBuffer = [new Float32Array(_vadBufferSize)];
110-
_previousVadBuffer = [new Float32Array(_vadBufferSize)];
119+
//_previousVadBuffer = [new Float32Array(_vadBufferSize)];
111120
if (isFloat32Input){
112121
//we need float32 for Meyda so this is all good
113122
_transferFun = function(thisArray, channel, i){
@@ -120,6 +129,19 @@ function init(){
120129
}
121130
}
122131

132+
movingAvgLoudness = undefined;
133+
maxLoudness = 0;
134+
135+
if (useSequenceAnalyzer){
136+
//Buffer the start of a sequence to analyze for keywords/trigger/wake-words etc.
137+
let sequenceStartFrames = Math.round((sequenceTimeForTrigger + 1000)/1000 * (inputSampleRate/inputSampleSize)); //TODO: why is this almost 2 times more than expected?
138+
mfccSequenceStartBuffer = ArrayOps.newCommon2dArray(sequenceStartFrames, Meyda.numberOfMFCCCoefficients, 0);
139+
loudnessSequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
140+
feature1SequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
141+
feature2SequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
142+
feature3SequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
143+
}
144+
123145
resetSequence();
124146

125147
_isFirstValidProcess = true;
@@ -150,6 +172,10 @@ function constructWorker(options) {
150172
inputSampleSize = options.setup.inputSampleSize || 512;
151173
processBufferSize = options.setup.bufferSize || inputSampleSize;
152174
vadMode = (options.setup.vadMode != undefined)? options.setup.vadMode : 3;
175+
_vadThreshold = (1 + vadMode/10);
176+
_warmUpFrames = Math.round(2*inputSampleRate/inputSampleSize); //input- or processBufferSize? We want ~2s so input makes sense
177+
_totalFrames = 0;
178+
153179
isFloat32Input = (options.setup.isFloat32 != undefined)? options.setup.isFloat32 : false;
154180

155181
if (options.setup.voiceEnergyCap != undefined) voiceEnergyCap = options.setup.voiceEnergyCap;
@@ -161,6 +187,7 @@ function constructWorker(options) {
161187
silenceActivationTime = options.setup.sequence.silenceActivationTime || 250;
162188
maxSequenceTime = options.setup.sequence.maxSequenceTime || 6000;
163189
minSequenceTime = options.setup.sequence.minSequenceTime || 600;
190+
//TODO: add sequenceTimeForTrigger
164191
}else{
165192
useSequenceAnalyzer = false;
166193
}
@@ -170,7 +197,7 @@ function constructWorker(options) {
170197
Meyda.numberOfMFCCCoefficients = 13;
171198
Meyda.windowingFunction = "hanning"; //"hamming"
172199
var meydaRequiredFeatures = ["mfcc", "loudness"];
173-
//https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec.)
200+
//https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec. but is buggy!?)
174201
var meydaAnalyzer = options.setup.meydaAnalyzer || {};
175202
var meydaFeatures = [];
176203
var meydaSettingsKeys = Object.keys(meydaAnalyzer);
@@ -207,11 +234,11 @@ function constructWorker(options) {
207234

208235
//averages
209236
function getWeightedMovingAverage(prevAvg, nextValue, weight){
210-
if (prevAvg == undefined){
211-
return nextValue;
212-
}else{
237+
//if (prevAvg == undefined){
238+
// return nextValue;
239+
//}else{
213240
return (prevAvg + (nextValue - prevAvg)/weight);
214-
}
241+
//}
215242
}
216243

217244
//sequence block
@@ -250,9 +277,16 @@ function sequenceDetector(voiceActivity){
250277
registerEvent(3, 'finished_voice');
251278

252279
}else if (_sequenceSawVoice){
253-
if (_sequenceIsActive && ((Date.now() - _sequenceStartedAt) > maxSequenceTime)) {
254-
_sequenceIsDone = true;
255-
registerEvent(4, 'finished_voice_maxtime');
280+
if (_sequenceIsActive){
281+
let timePassed = (Date.now() - _sequenceStartedAt);
282+
if (timePassed > maxSequenceTime){
283+
_sequenceIsDone = true;
284+
registerEvent(4, 'finished_voice_maxtime');
285+
}else if (!_sequenceCheckedTrigger && timePassed >= sequenceTimeForTrigger){
286+
sequenceTriggerAnalyzer();
287+
_sequenceCheckedTrigger = true;
288+
registerEvent(6, 'sequence_trigger_result');
289+
}
256290
}
257291
}
258292

@@ -261,6 +295,33 @@ function sequenceDetector(voiceActivity){
261295
resetSequence();
262296
}
263297
}
298+
/**
 * Scan the buffered voice-energy values of the sequence start
 * ('feature2SequenceStartBuffer') for the longest uninterrupted run of
 * values greater than zero.
 *
 * The actual trigger/wake-word analysis is still TODO; for now only the
 * largest activity block is located.
 *
 * @returns {{start: number, end: number, range: number}} best block found:
 *   'start' is the first active index, 'end' the index right after the block
 *   (exclusive, usable with slice) and 'range' its length in frames.
 *   All three are 0 if no active frame exists.
 */
function sequenceTriggerAnalyzer(){
	//TODO: implement the real trigger analysis on top of the detected block
	var sqVoiceEnergy = feature2SequenceStartBuffer;
	var bestRange = {start: 0, end: 0, range: 0};
	var runStart = -1;		//-1 = currently not inside an active run
	//find largest activity block
	//NOTE: we iterate one index past the end so that a run which is still active
	//at the end of the buffer gets closed and compared as well
	for (let i=0; i<=sqVoiceEnergy.length; i++){
		let isActive = (i < sqVoiceEnergy.length && sqVoiceEnergy[i] > 0);
		if (isActive){
			if (runStart < 0) runStart = i;
		}else if (runStart >= 0){
			let runLength = i - runStart;
			//strict '>' keeps the earliest block when two have equal length
			if (runLength > bestRange.range){
				bestRange = {start: runStart, end: i, range: runLength};
			}
			//always start fresh after a gap, otherwise runs would merge across silence
			runStart = -1;
		}
	}
	//TODO: use block, e.g.:
	//var reducedMfccBuffer = mfccSequenceStartBuffer.slice(bestRange.start, bestRange.end);
	return bestRange;
}
264325
function resetSequence(){
265326
//vad
266327
voiceEnergy = 0;
@@ -272,6 +333,7 @@ function resetSequence(){
272333
_sequenceIsActive = false;
273334
_sequenceStartedAt = 0;
274335
_sequenceIsDone = false;
336+
_sequenceCheckedTrigger = false;
275337
}
276338
function registerEvent(code, msg, data){
277339
var msg = {
@@ -287,6 +349,13 @@ function registerEvent(code, msg, data){
287349
msg.vadSequenceStarted = _sequenceStartedAt;
288350
msg.vadSequenceEnded = Date.now();
289351
break;
352+
case 6:
353+
//sequence trigger-check phase data
354+
msg.vadSequenceStarted = _sequenceStartedAt;
355+
msg.mfccProfile = mfccSequenceStartBuffer;
356+
msg.loudnessProfile = loudnessSequenceStartBuffer;
357+
msg.featuresArray = [feature1SequenceStartBuffer, feature2SequenceStartBuffer, feature3SequenceStartBuffer];
358+
msg.avgLoudness = movingAvgLoudness;
290359
default:
291360
break;
292361
}
@@ -301,6 +370,7 @@ function process(data) {
301370
//Use 1st input and output only
302371
let input = data.samples;
303372
let thisInputSampleSize = input[0].length;
373+
_totalFrames++;
304374

305375
if (_isFirstValidProcess){
306376
_isFirstValidProcess = false;
@@ -326,30 +396,41 @@ function process(data) {
326396
_processRingBuffer.push(input, _transferFun);
327397

328398
//Process if we have enough frames
329-
var vadResults = [];
330-
var loudnessResults = [];
331-
var mfcc = [];
399+
let vadFramesAvailable = Math.floor(_processRingBuffer.framesAvailable/_vadBufferSize);
400+
let vadResults = new Array(vadFramesAvailable);
401+
let loudnessResults = new Array(vadFramesAvailable);
402+
let mfcc = new Array(vadFramesAvailable);
403+
//let moreFeatures = new Array(vadFramesAvailable);
404+
let n = 0;
332405
while (_processRingBuffer.framesAvailable >= _vadBufferSize) {
333406
//pull samples
334407
_processRingBuffer.pull(_vadBuffer);
335408

336409
//Meyda features
337-
let features = Meyda.extract(Meyda.features, _vadBuffer[0], _previousVadBuffer[0]);
410+
let features = Meyda.extract(Meyda.features, _vadBuffer[0]); //we don't add '_previousVadBuffer[0]' because it saves time and 'spectralFlux' is buggy
338411
//console.log("features_meyda", features);
339-
_previousVadBuffer = _vadBuffer;
340412

341413
//let loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
342414
//let loudness = features.loudness.total;
343415
let loudness = features.loudness.specific.slice(1, 5).reduce(function(a, b){ return a + b; });
344-
_maxLoudness = Math.max(_maxLoudness, loudness);
345-
_movingAvgLoudness = getWeightedMovingAverage(_movingAvgLoudness, loudness, _movingAvgLoudnessWeight);
416+
maxLoudness = Math.max(maxLoudness, loudness);
417+
if (movingAvgLoudness == undefined){
418+
movingAvgLoudness = loudness * _vadThreshold;
419+
}
420+
if (_totalFrames < _warmUpFrames){
421+
movingAvgLoudness = getWeightedMovingAverage(movingAvgLoudness, loudness, 10);
422+
}else{
423+
movingAvgLoudness = getWeightedMovingAverage(movingAvgLoudness, loudness, _movingAvgLoudnessWeight);
424+
}
346425

347-
mfcc.push(features.mfcc);
426+
mfcc[n] = features.mfcc;
348427

349428
//activity check
350-
var voiceActivity = (loudness/_movingAvgLoudness) > (1 + vadMode/10)? 1 : 0;
351-
vadResults.push(voiceActivity);
352-
loudnessResults.push(loudness);
429+
let loudnessNorm = (loudness/movingAvgLoudness);
430+
let voiceActivity = loudnessNorm > _vadThreshold? 1 : 0;
431+
vadResults[n] = voiceActivity;
432+
loudnessResults[n] = loudness;
433+
//moreFeatures[n] = [];
353434

354435
//voice energy and sequence check
355436
if (voiceActivity){
@@ -360,19 +441,25 @@ function process(data) {
360441
if (voiceEnergy < 0) voiceEnergy = 0;
361442
}
362443
if (useSequenceAnalyzer){
444+
ArrayOps.pushAndShift(mfccSequenceStartBuffer, features.mfcc);
445+
ArrayOps.pushAndShift(loudnessSequenceStartBuffer, (loudnessNorm > 1)? (loudnessNorm-1) : 0); //loudness
446+
ArrayOps.pushAndShift(feature1SequenceStartBuffer, voiceActivity);
447+
ArrayOps.pushAndShift(feature2SequenceStartBuffer, voiceEnergy);
448+
ArrayOps.pushAndShift(feature3SequenceStartBuffer, 0);
363449
sequenceDetector(voiceActivity);
364450
}
451+
n++;
365452
}
366-
if (vadResults.length > 0){
453+
if (n > 0){
367454
//Send info
368-
//console.log("features", vadResults, loudnessResults, _movingAvgLoudness, _maxLoudness);
455+
//console.log("features", vadResults, loudnessResults, movingAvgLoudness, maxLoudness);
369456
postMessage({
370457
voiceActivity: vadResults,
371458
voiceEnergy: voiceEnergy,
372459
voiceLoudness: loudnessResults,
373460
mfcc: mfcc,
374-
movingAvgLoudness: _movingAvgLoudness,
375-
maxLoudness: _maxLoudness
461+
movingAvgLoudness: movingAvgLoudness,
462+
maxLoudness: maxLoudness
376463
});
377464
}
378465
}
@@ -400,9 +487,13 @@ function release(options){
400487
//destroy
401488
_processRingBuffer = null;
402489
_vadBuffer = null;
403-
_previousVadBuffer = null;
404-
_movingAvgLoudness = undefined;
405-
_maxLoudness = 0;
490+
//_previousVadBuffer = null;
491+
_totalFrames = 0;
492+
mfccSequenceStartBuffer = null;
493+
loudnessSequenceStartBuffer = null;
494+
feature1SequenceStartBuffer = null;
495+
feature2SequenceStartBuffer = null;
496+
feature3SequenceStartBuffer = null;
406497
}
407498

408499
//--- helpers ---

src/modules/shared/common.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,38 @@ function ChannelCountException(message){
2121
this.name = "ChannelCountException";
2222
}
2323

24+
//Array operations
25+
26+
/**
 * Helpers to create and update plain (pre-sized) arrays.
 */
var ArrayOps = {};

/**
 * Create an array of length 'n' where every entry is 'startValue'.
 * @param {number} n - number of entries
 * @param {*} [startValue=0] - value assigned to every entry
 * @returns {Array} new array
 */
ArrayOps.newCommon1dArray = function(n, startValue){
	if (startValue == undefined) startValue = 0;
	var array = new Array(n);
	for (let i=0; i<n; i++){
		array[i] = startValue;
	}
	return array;
};

/**
 * Create an array of 'n' rows with 'm' entries each, all set to 'startValue'.
 * Each row is its own array instance.
 * @param {number} n - number of rows
 * @param {number} m - number of entries per row
 * @param {*} [startValue=0] - value assigned to every entry
 * @returns {Array<Array>} new 2D array
 */
ArrayOps.newCommon2dArray = function(n, m, startValue){
	if (startValue == undefined) startValue = 0;
	var array = new Array(n);
	for (let i=0; i<n; i++){
		array[i] = new Array(m);
		for (let j=0; j<m; j++){
			array[i][j] = startValue;
		}
	}
	return array;
};

/**
 * Drop the first entry, move all others one position towards the start and
 * write 'pushValue' into the last position. The array is modified in place
 * and keeps its length.
 * @param {Array} array - array to update
 * @param {*} pushValue - value written to the last position
 * @returns {Array} the same array instance
 */
ArrayOps.pushAndShift = function(array, pushValue){
	//NOTE: this operation does not need to allocate memory compared to shift().push()
	var lastIndex = array.length - 1;
	//an empty array has no slot to write to (index -1 would only create a stray "-1" property)
	if (lastIndex < 0) return array;
	for (let i=0; i<lastIndex; i++){
		array[i] = array[i+1];
	}
	array[lastIndex] = pushValue;
	return array;
};
55+
2456
//Converters
2557

2658
var CommonConverters = {};

0 commit comments

Comments
 (0)