Skip to content

Commit f059519

Browse files
committed
changed the way 'vadMode' works for SEPIA VAD worker
1 parent 79fafec commit f059519

File tree

3 files changed

+124
-55
lines changed

3 files changed

+124
-55
lines changed

src/modules/sepia-vad-worker.js

Lines changed: 77 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,16 @@ let channelCount;
4242
let inputSampleSize;
4343
let processBufferSize; //defines '_processRingBuffer' size together with 'inputSampleSize'
4444
let vadMode;
45+
let vadThreshold;
46+
//Default 'vadThreshold' for each 'vadMode' (key = mode number).
//Used as fallback when the setup options carry no (truthy) 'vadThreshold'.
let vadDefaultThresholds = {
47+
1: 1.5,
48+
2: 3.5,
49+
3: 3.5
50+
}
4551
let isFloat32Input; //default false
4652

4753
let voiceEnergy;
48-
let voiceEnergyCap = 50;
54+
let voiceEnergyCap = 42;
4955
let voiceEnergyDropRate = 2;
5056
let _samplesToTimeMsFactor;
5157

@@ -60,8 +66,9 @@ let _transferFun;
6066
//parameters to calculate vad
6167
let movingAvgLoudness;
6268
let maxLoudness = 0;
63-
let _movingAvgLoudnessWeight = 800; //TODO: make variable and normalize with sample-rate
64-
let _vadThreshold;
69+
let _movingAvgLoudnessWeight = 400; //TODO: make variable and normalize with sample-rate
70+
let _mfccDynamicWeightsArray;
71+
let _mfccLastArray;
6572
let _warmUpFrames;
6673
let _totalFrames;
6774

@@ -98,15 +105,22 @@ function init(){
98105
var allowedBufferSizes = [8192, 4096, 2048, 1024, 512, 256, 128]; //recommended: 10-30ms frame length, e.g. 512/16000 = 32ms (recommended)
99106
_vadBufferSize = 0;
100107
for (let i=0; i<allowedBufferSizes.length; i++){
101-
if (processBufferSize == allowedBufferSizes[i] || processBufferSize % allowedBufferSizes[i] == 0){
102-
_vadFrames = processBufferSize / allowedBufferSizes[i];
103-
_vadBufferSize = allowedBufferSizes[i];
104-
break;
108+
//common cases
109+
if (inputSampleRate == 16000 && processBufferSize >= 512){
110+
_vadBufferSize = 512;
111+
112+
//best fallback
113+
}else{
114+
if (processBufferSize == allowedBufferSizes[i] || processBufferSize % allowedBufferSizes[i] == 0){
115+
_vadBufferSize = allowedBufferSizes[i];
116+
break;
117+
}
105118
}
106119
}
107120
if (_vadBufferSize == 0){
108121
throw JSON.stringify(new BufferSizeException("The 'bufferSize' has to be equal or a multiple of: " + allowedBufferSizes.join(", ")));
109122
}else{
123+
_vadFrames = processBufferSize / _vadBufferSize;
110124
_samplesToTimeMsFactor = 1000/inputSampleRate;
111125
_vadFrameTimeMs = Math.round(_vadBufferSize * _samplesToTimeMsFactor);
112126
if (_vadFrameTimeMs < 5 || _vadFrameTimeMs > 86){
@@ -129,12 +143,21 @@ function init(){
129143
}
130144
}
131145

146+
//Meyda requirements (2)
Meyda.sampleRate = inputSampleRate;
Meyda.bufferSize = _vadBufferSize;
//Power-of-2 test: (x & (x - 1)) is 0 only for powers of 2.
//NOTE: the inner parentheses are required - '!=' binds tighter than '&', so without them
//the expression collapses to 'x & 1' and only odd sizes would be rejected.
if (!Meyda.bufferSize || (Meyda.bufferSize & (Meyda.bufferSize - 1)) !== 0){
	throw JSON.stringify({name: "VadModuleMeydaError", message: "Meyda buffer-size must be power of 2, e.g. 128, 256, 512, 1024, ..."});
}
152+
132153
movingAvgLoudness = undefined;
133154
maxLoudness = 0;
155+
_mfccDynamicWeightsArray = ArrayOps.newCommon1dArray(Meyda.numberOfMFCCCoefficients, 1);
156+
_mfccLastArray = ArrayOps.newCommon1dArray(Meyda.numberOfMFCCCoefficients, 0);
134157

135158
if (useSequenceAnalyzer){
136159
//Buffer the start of a sequence to analyze for keywords/trigger/wake-words etc.
137-
let sequenceStartFrames = Math.round((sequenceTimeForTrigger + 1000)/1000 * (inputSampleRate/inputSampleSize)); //TODO: why is this almost 2 times more than expected?
160+
let sequenceStartFrames = Math.round((sequenceTimeForTrigger + 1000)/1000 * (inputSampleRate/_vadBufferSize)); //TODO: why is this almost 2 times more than expected?
138161
mfccSequenceStartBuffer = ArrayOps.newCommon2dArray(sequenceStartFrames, Meyda.numberOfMFCCCoefficients, 0);
139162
loudnessSequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
140163
feature1SequenceStartBuffer = ArrayOps.newCommon1dArray(sequenceStartFrames, 0);
@@ -156,6 +179,7 @@ function ready(){
156179
inputIsFloat32: isFloat32Input,
157180
processBufferSize: processBufferSize,
158181
vadMode: vadMode,
182+
vadThreshold: vadThreshold,
159183
vadFramesMax: _vadFrames,
160184
vadBufferSize: _vadBufferSize,
161185
vadFrameTimeMs: _vadFrameTimeMs,
@@ -171,8 +195,8 @@ function constructWorker(options) {
171195
channelCount = 1; //options.setup.channelCount || 1; //TODO: only MONO atm
172196
inputSampleSize = options.setup.inputSampleSize || 512;
173197
processBufferSize = options.setup.bufferSize || inputSampleSize;
174-
vadMode = (options.setup.vadMode != undefined)? options.setup.vadMode : 3;
175-
_vadThreshold = (1 + vadMode/10);
198+
vadMode = options.setup.vadMode || 3;
199+
vadThreshold = options.setup.vadThreshold || vadDefaultThresholds[vadMode] || 3;
176200
_warmUpFrames = Math.round(2*inputSampleRate/inputSampleSize); //input- or processBufferSize? We want ~2s so input makes sense
177201
_totalFrames = 0;
178202

@@ -193,8 +217,8 @@ function constructWorker(options) {
193217
}
194218

195219
//Meyda options and defaults
196-
Meyda.melBands = 26;
197-
Meyda.numberOfMFCCCoefficients = 13;
220+
Meyda.melBands = 40; //40 26;
221+
Meyda.numberOfMFCCCoefficients = 20; //13;
198222
Meyda.windowingFunction = "hanning"; //"hamming"
199223
var meydaRequiredFeatures = ["mfcc", "loudness"];
200224
//https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec. but is buggy!?)
@@ -211,12 +235,7 @@ function constructWorker(options) {
211235
}
212236
});
213237
}
214-
//Meyda requirements
215-
Meyda.sampleRate = inputSampleRate;
216-
Meyda.bufferSize = inputSampleSize;
217-
if (!Meyda.bufferSize || (Meyda.bufferSize & (Meyda.bufferSize -1) != 0)){
218-
throw {name: "VadModuleMeydaError", message: "Meyda buffer-size must be power of 2, e.g. 128, 256, 512, 1024, ..."};
219-
}
238+
//Meyda requirements (1)
220239
if (!meydaFeatures){
221240
Meyda.features = meydaRequiredFeatures;
222241
}else{
@@ -335,11 +354,11 @@ function resetSequence(){
335354
_sequenceIsDone = false;
336355
_sequenceCheckedTrigger = false;
337356
}
338-
function registerEvent(code, msg, data){
357+
function registerEvent(code, _msg, data){
339358
var msg = {
340359
vadSequenceCode: code,
341-
vadSequenceMsg: msg
342-
};
360+
vadSequenceMsg: _msg
361+
}
343362
switch (code){
344363
//case 1: voice start
345364
//case 2: sequence start
@@ -363,6 +382,27 @@ function registerEvent(code, msg, data){
363382
postMessage(msg);
364383
}
365384

385+
/**
 * Classify one VAD frame as voice (1) or no voice (0).
 *
 * vadMode 3: every MFCC coefficient gets a dynamic weight. The weight decays (x0.66) while the
 * coefficient stays within 20% of its previous value and is reset to 1.0 once it changes more.
 * The average of the weighted |MFCC| values minus 'averageLoudness' is compared to 'vadThreshold'.
 * Any other vadMode: 'loudnessNorm' is compared to 'vadThreshold' directly.
 *
 * Reads the module variables 'vadMode' and 'vadThreshold'; in mode 3 it also updates
 * '_mfccDynamicWeightsArray' and '_mfccLastArray' in place.
 *
 * @param {number[]} mfccArray - MFCC coefficients of the current frame
 * @param {number} loudnessNorm - loudness value compared to the threshold (modes other than 3)
 * @param {number} averageLoudness - value subtracted from the weighted MFCC average (mode 3)
 * @returns {number} 1 if the frame is classified as voice, otherwise 0
 */
function getVoiceActivity(mfccArray, loudnessNorm, averageLoudness){
	//loose comparison kept on purpose: 'vadMode' comes from external setup options
	if (vadMode != 3){
		return (loudnessNorm > vadThreshold? 1 : 0);
	}
	//no coefficients -> nothing to classify (would otherwise be NaN > threshold)
	if (mfccArray.length === 0){
		return 0;
	}
	var sum = 0;
	for (let i=0; i<mfccArray.length; i++){
		let current = mfccArray[i];
		let previous = _mfccLastArray[i];
		//relative change to previous frame; handle 'previous == 0' explicitly to avoid x/0 and 0/0 (NaN)
		let change;
		if (previous === 0){
			change = (current === 0)? 0 : Infinity;
		}else{
			change = Math.abs(current/previous - 1);
		}
		if (change < 0.20){
			//static coefficient: fade out its contribution
			_mfccDynamicWeightsArray[i] = _mfccDynamicWeightsArray[i] * 0.66;
		}else{
			//coefficient moved: full weight again
			_mfccDynamicWeightsArray[i] = 1.0;
		}
		_mfccLastArray[i] = current;
		sum += Math.abs(current * _mfccDynamicWeightsArray[i]);
	}
	var signal = sum/mfccArray.length - averageLoudness;
	return (signal > vadThreshold? 1 : 0);
}
405+
366406
function process(data) {
367407
//expected: data.samples, data.sampleRate, data.channels, data.type
368408
//might have: data.rms - TODO: make use of?
@@ -402,34 +442,38 @@ function process(data) {
402442
let mfcc = new Array(vadFramesAvailable);
403443
//let moreFeatures = new Array(vadFramesAvailable);
404444
let n = 0;
405-
while (_processRingBuffer.framesAvailable >= _vadBufferSize) {
445+
while (_processRingBuffer.framesAvailable >= _vadBufferSize){
406446
//pull samples
407447
_processRingBuffer.pull(_vadBuffer);
408448

409449
//Meyda features
410450
let features = Meyda.extract(Meyda.features, _vadBuffer[0]); //we don't add '_previousVadBuffer[0]' because it saves time and 'spectralFlux' is buggy
411451
//console.log("features_meyda", features);
412452

413-
//let loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
414-
//let loudness = features.loudness.total;
415-
let loudness = features.loudness.specific.slice(1, 5).reduce(function(a, b){ return a + b; });
453+
let loudness;
454+
//loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
455+
//loudness = features.loudness.total;
456+
if (vadMode == 1){
457+
loudness = features.loudness.specific.slice(1, 5).reduce(function(a, b){ return a + b; }); //1-5 on the bark scale
458+
}else{
459+
loudness = features.mfcc.reduce(function(a, c){ return (a + Math.abs(c)); })/features.mfcc.length; //avg(...abs(MFCC[i]))
460+
}
416461
maxLoudness = Math.max(maxLoudness, loudness);
417462
if (movingAvgLoudness == undefined){
418-
movingAvgLoudness = loudness * _vadThreshold;
463+
movingAvgLoudness = loudness * vadThreshold;
419464
}
420465
if (_totalFrames < _warmUpFrames){
421466
movingAvgLoudness = getWeightedMovingAverage(movingAvgLoudness, loudness, 10);
422467
}else{
423468
movingAvgLoudness = getWeightedMovingAverage(movingAvgLoudness, loudness, _movingAvgLoudnessWeight);
424469
}
425470

426-
mfcc[n] = features.mfcc;
427-
428471
//activity check
429-
let loudnessNorm = (loudness/movingAvgLoudness);
430-
let voiceActivity = loudnessNorm > _vadThreshold? 1 : 0;
472+
let loudnessNorm = (loudness - movingAvgLoudness);
473+
let voiceActivity = getVoiceActivity(features.mfcc, loudnessNorm, movingAvgLoudness);
431474
vadResults[n] = voiceActivity;
432475
loudnessResults[n] = loudness;
476+
mfcc[n] = features.mfcc;
433477
//moreFeatures[n] = [];
434478

435479
//voice energy and sequence check
@@ -442,10 +486,10 @@ function process(data) {
442486
}
443487
if (useSequenceAnalyzer){
444488
ArrayOps.pushAndShift(mfccSequenceStartBuffer, features.mfcc);
445-
ArrayOps.pushAndShift(loudnessSequenceStartBuffer, (loudnessNorm > 1)? (loudnessNorm-1) : 0); //loudness
489+
ArrayOps.pushAndShift(loudnessSequenceStartBuffer, loudnessNorm); //(loudnessNorm > 1)? (loudnessNorm-1) : 0); //loudness
446490
ArrayOps.pushAndShift(feature1SequenceStartBuffer, voiceActivity);
447491
ArrayOps.pushAndShift(feature2SequenceStartBuffer, voiceEnergy);
448-
ArrayOps.pushAndShift(feature3SequenceStartBuffer, 0);
492+
ArrayOps.pushAndShift(feature3SequenceStartBuffer, features.loudness.specific);
449493
sequenceDetector(voiceActivity);
450494
}
451495
n++;
@@ -490,6 +534,7 @@ function release(options){
490534
//_previousVadBuffer = null;
491535
_totalFrames = 0;
492536
mfccSequenceStartBuffer = null;
537+
_mfccDynamicWeightsArray = null;
493538
loudnessSequenceStartBuffer = null;
494539
feature1SequenceStartBuffer = null;
495540
feature2SequenceStartBuffer = null;

test-commons.js

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,19 @@ function plotData(data, plotIndex, expandData){
112112
}
113113
}else{
114114
var ele = addChartContainerToPage();
115-
var x = uPlot.lazy.createSequence(0, data.length);
116-
uPlot.lazy.plot({
115+
var conf = {
117116
targetElement: ele,
118117
showPoints: false,
119-
strokeWidth: 1,
120-
data: [x, data]
121-
});
118+
strokeWidth: 1
119+
}
120+
if (expandData){
121+
var x = uPlot.lazy.createSequence(0, data[0].length);
122+
conf.data = [x, ...data];
123+
}else{
124+
var x = uPlot.lazy.createSequence(0, data.length);
125+
conf.data = [x, data];
126+
}
127+
uPlot.lazy.plot(conf);
122128
}
123129
}
124130
function drawHeatmap(data, hmIndex, maxPoints){
@@ -138,4 +144,12 @@ function drawHeatmap(data, hmIndex, maxPoints){
138144
heatmap.addDataArray(d);
139145
});
140146
heatmap.draw();
147+
}
148+
/**
 * Create an array of length 'n' where every entry is 'startValue'.
 *
 * @param {number} n - requested length; fractions are truncated, negative or non-finite values give an empty array
 * @param {*} [startValue=0] - value for every entry (null/undefined fall back to 0)
 * @returns {Array} new array with 'n' copies of 'startValue'
 */
function createArrayWithStartValue(n, startValue){
	if (startValue == undefined) startValue = 0;
	//normalize length: 'new Array(x)' throws for fractions/negatives and yields [undefined] for undefined
	var length = Number(n);
	if (!Number.isFinite(length) || length < 0){
		length = 0;
	}
	return new Array(Math.floor(length)).fill(startValue);
}

test2.html

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,23 @@
66
<meta name="viewport" content="width=device-width, initial-scale=1">
77
<title>SEPIA Web Audio Processor Test</title>
88

9-
<script src="src/visualization/uPlot.iife.min.js"></script>
10-
<script src="src/visualization/uPlot-lazy.min.js"></script>
11-
<script src="src/visualization/uPlot-lazy-heatmap.min.js"></script>
12-
<link rel="stylesheet" href="src/visualization/uPlot.min.css">
9+
<script src="src/visualization/uPlot.iife.min.js?v=0.6.9"></script>
10+
<script src="src/visualization/uPlot-lazy.min.js?v=0.6.9"></script>
11+
<script src="src/visualization/uPlot-lazy-heatmap.min.js?v=0.6.9"></script>
12+
<link rel="stylesheet" href="src/visualization/uPlot.min.css?v=0.6.9">
1313

14-
<script src="src/polyfills/audioworklet-polyfill.js"></script>
15-
<script type="text/javascript" src="src/processor.js"></script>
14+
<script src="src/polyfills/audioworklet-polyfill.js?v=0.6.9"></script>
15+
<script type="text/javascript" src="src/processor.js?v=0.6.9"></script>
1616
<script>
1717
//set correct modules folder
1818
SepiaFW.webAudio.defaultProcessorOptions.moduleFolder = "src/modules";
1919
</script>
2020

21-
<script type="text/javascript" src="src/resources/pcm-convert.js"></script>
22-
<script type="text/javascript" src="src/resources/opus-file-splitter.js"></script>
23-
<script type="text/javascript" src="src/resources/fir-filter-resampler.js"></script>
21+
<script type="text/javascript" src="src/resources/pcm-convert.js?v=0.6.9"></script>
22+
<script type="text/javascript" src="src/resources/opus-file-splitter.js?v=0.6.9"></script>
23+
<script type="text/javascript" src="src/resources/fir-filter-resampler.js?v=0.6.9"></script>
2424

25-
<link rel="stylesheet" type="text/css" href="tests.css">
25+
<link rel="stylesheet" type="text/css" href="tests.css?v=0.6.9">
2626
<style></style>
2727
</head>
2828
<body>
@@ -73,7 +73,7 @@ <h1>SEPIA Web Audio Processor</h1>
7373
<label><b>Resampler (Speex WASM)</b></label>
7474
<label>Sample-Rate:</label><input id="resamplerSampleRate" class="small" value="16000" onchange="setResamplerSampleRate(this);" placeholder="16000">
7575
<label>Buffer-Size:</label><input id="resamplerBufferSize" class="small" value="512" onchange="setResamplerBufferSize(this);" placeholder="512"><div style="flex: 1 0 100%;"></div>
76-
<label>Quality:</label><input id="resampleQuality" class="small" value="7" onchange="setResampleQuality(this);" placeholder="0-10 (0: fastest, 10: best)">
76+
<label>Quality:</label><input id="resampleQuality" class="small" value="3" onchange="setResampleQuality(this);" placeholder="0-10 (0: fastest, 10: best)">
7777
<div id="resamplerGainBox" class="slidecontainer" style="display: none;">
7878
<label>Gain:</label><input id="resamplerGain" type="range" min="1" max="50" step="1" value="1" class="slider"><input type="number" id="resamplerGainShow" class="small">
7979
</div>
@@ -85,7 +85,8 @@ <h1>SEPIA Web Audio Processor</h1>
8585
<div id="vadWorkerControls" class="contorlGroup">
8686
<label><b>VAD</b></label>
8787
<label>Buffer-Size:</label><input id="vadWorkerBufferSize" class="small" value="512" placeholder="960" onchange="setVadWorkerBufferSize(+this.value);">
88-
<label>Mode:</label><input id="vadWorkerVadMode" class="small" value="3" placeholder="1-3 (3: agressive)" onchange="setVadWorkerVadMode(+this.value);">
88+
<label>Mode:</label><input id="vadWorkerVadMode" class="small" type="number" value="1" placeholder="1-3 (3: agressive)" onchange="setVadWorkerVadMode(+this.value);">
89+
<label>Threshold:</label><input id="vadWorkerVadThreshold" class="small" type="number" value="0" step="0.1" title="e.g. ~1.3 (mode 1) or ~4 (mode 2-3)" onchange="setVadWorkerVadThreshold(+this.value);">
8990
</div>
9091
<div id="wakeWordWorkerControls" class="contorlGroup">
9192
<label><b>Wake-Word</b></label>
@@ -340,6 +341,7 @@ <h1>SEPIA Web Audio Processor</h1>
340341
inputSampleSize: resamplerBufferSize, //output bufferSize of previous module
341342
bufferSize: vadWorkerBufferSize,
342343
vadMode: vadWorkerVadMode,
344+
vadThreshold: vadWorkerVadThreshold,
343345
//voiceEnergyCap: 50,
344346
//voiceEnergyDropRate: 2,
345347
sequence: {
@@ -597,8 +599,10 @@ <h1>SEPIA Web Audio Processor</h1>
597599
}
598600
function setVoiceActivity(data){
599601
//console.log("vad", data);
600-
if (data.voiceLoudness != undefined && data.voiceLoudness.length == 1){
601-
plotData([[data.voiceLoudness[0]/data.movingAvgLoudness], 1, [data.maxLoudness/data.movingAvgLoudness]], 3, true); //[data.movingAvgLoudness]
602+
if (data.voiceLoudness != undefined){
603+
data.voiceLoudness.forEach(function(ld){
604+
plotData([[ld - data.movingAvgLoudness], 0, [data.maxLoudness - data.movingAvgLoudness]], 3, true); //[data.movingAvgLoudness]
605+
});
602606
}else if (data.voiceActivity != undefined){
603607
plotData(data.voiceActivity, 3);
604608
}
@@ -628,12 +632,14 @@ <h1>SEPIA Web Audio Processor</h1>
628632
console.log("vadSequenceCode=6", data);
629633
addTitleToPage("VAD result group");
630634
if (data.loudnessProfile){
631-
plotData(data.loudnessProfile);
635+
//plotData(data.loudnessProfile);
636+
plotData([data.loudnessProfile, createArrayWithStartValue(data.loudnessProfile.length, vadWorkerVadThreshold)], undefined, true);
632637
}
633638
if (data.featuresArray){
634639
plotData(data.featuresArray[0]);
635640
plotData(data.featuresArray[1]);
636-
plotData(data.featuresArray[2]);
641+
//plotData(data.featuresArray[2]);
642+
drawHeatmap(data.featuresArray[2], undefined, data.featuresArray[2].length);
637643
}
638644
if (data.mfccProfile){
639645
drawHeatmap(data.mfccProfile, undefined, data.mfccProfile.length);
@@ -802,14 +808,18 @@ <h1>SEPIA Web Audio Processor</h1>
802808
useWaveEncoder(doUseWaveEncoder);
803809

804810
var doUseVadWorker = document.getElementById("useVadWorker").checked;
805-
var vadWorkerBufferSize = +document.getElementById("vadWorkerBufferSize").value; //e.g. 960;
806-
var vadWorkerVadMode = +document.getElementById("vadWorkerVadMode").value; //e.g. 3;
811+
var vadWorkerBufferSize = +document.getElementById("vadWorkerBufferSize").value; //e.g. 512 (Meyda) or 960 (WebRTC);
812+
var vadWorkerVadMode = +document.getElementById("vadWorkerVadMode").value; //e.g. 3;
813+
var vadWorkerVadThreshold = +document.getElementById("vadWorkerVadThreshold").value; //e.g. 4 (Meyda only; mode 1: ~1.3, mode 2-n: 3-4)
807814
//Store the VAD buffer-size chosen on the page ('onchange' of the 'vadWorkerBufferSize' input);
//the value is handed over as 'bufferSize' in the VAD module setup options.
function setVadWorkerBufferSize(val){
808815
vadWorkerBufferSize = val;
809816
}
810817
//Store the VAD mode chosen on the page ('onchange' of the 'vadWorkerVadMode' input);
//the value is handed over as 'vadMode' in the VAD module setup options.
function setVadWorkerVadMode(val){
811818
vadWorkerVadMode = val;
812819
}
820+
//Store the VAD threshold chosen on the page ('onchange' of the 'vadWorkerVadThreshold' input);
//the value is handed over as 'vadThreshold' in the VAD module setup options.
function setVadWorkerVadThreshold(val){
821+
vadWorkerVadThreshold = val;
822+
}
813823
function useVadWorker(useIt){
814824
doUseVadWorker = useIt;
815825
if (useIt){

0 commit comments

Comments
 (0)