@@ -42,10 +42,16 @@ let channelCount;
4242let inputSampleSize ;
4343let processBufferSize ; //defines '_processRingBuffer' size together with 'inputSampleSize'
4444let vadMode ;
45+ let vadThreshold ;
46+ let vadDefaultThresholds = {
47+ 1 : 1.5 ,
48+ 2 : 3.5 ,
49+ 3 : 3.5
50+ }
4551let isFloat32Input ; //default false
4652
4753let voiceEnergy ;
48- let voiceEnergyCap = 50 ;
54+ let voiceEnergyCap = 42 ;
4955let voiceEnergyDropRate = 2 ;
5056let _samplesToTimeMsFactor ;
5157
@@ -60,8 +66,9 @@ let _transferFun;
6066//parameters to calculate vad
6167let movingAvgLoudness ;
6268let maxLoudness = 0 ;
63- let _movingAvgLoudnessWeight = 800 ; //TODO: make variable and normalize with sample-rate
64- let _vadThreshold ;
69+ let _movingAvgLoudnessWeight = 400 ; //TODO: make variable and normalize with sample-rate
70+ let _mfccDynamicWeightsArray ;
71+ let _mfccLastArray ;
6572let _warmUpFrames ;
6673let _totalFrames ;
6774
@@ -98,15 +105,22 @@ function init(){
98105 var allowedBufferSizes = [ 8192 , 4096 , 2048 , 1024 , 512 , 256 , 128 ] ; //recommended: 10-30ms frame length, e.g. 512/16000 = 32ms (recommended)
99106 _vadBufferSize = 0 ;
100107 for ( let i = 0 ; i < allowedBufferSizes . length ; i ++ ) {
101- if ( processBufferSize == allowedBufferSizes [ i ] || processBufferSize % allowedBufferSizes [ i ] == 0 ) {
102- _vadFrames = processBufferSize / allowedBufferSizes [ i ] ;
103- _vadBufferSize = allowedBufferSizes [ i ] ;
104- break ;
108+ //common cases
109+ if ( inputSampleRate == 16000 && processBufferSize >= 512 ) {
110+ _vadBufferSize = 512 ;
111+
112+ //best fallback
113+ } else {
114+ if ( processBufferSize == allowedBufferSizes [ i ] || processBufferSize % allowedBufferSizes [ i ] == 0 ) {
115+ _vadBufferSize = allowedBufferSizes [ i ] ;
116+ break ;
117+ }
105118 }
106119 }
107120 if ( _vadBufferSize == 0 ) {
108121 throw JSON . stringify ( new BufferSizeException ( "The 'bufferSize' has to be equal or a multiple of: " + allowedBufferSizes . join ( ", " ) ) ) ;
109122 } else {
123+ _vadFrames = processBufferSize / _vadBufferSize ;
110124 _samplesToTimeMsFactor = 1000 / inputSampleRate ;
111125 _vadFrameTimeMs = Math . round ( _vadBufferSize * _samplesToTimeMsFactor ) ;
112126 if ( _vadFrameTimeMs < 5 || _vadFrameTimeMs > 86 ) {
@@ -129,12 +143,21 @@ function init(){
129143 }
130144 }
131145
146+ //Meyda requirements (2)
147+ Meyda . sampleRate = inputSampleRate ;
148+ Meyda . bufferSize = _vadBufferSize ;
149+ if ( ! Meyda . bufferSize || ( Meyda . bufferSize & ( Meyda . bufferSize - 1 ) != 0 ) ) {
150+ throw JSON . stringify ( { name : "VadModuleMeydaError" , message : "Meyda buffer-size must be power of 2, e.g. 128, 256, 512, 1024, ..." } ) ;
151+ }
152+
132153 movingAvgLoudness = undefined ;
133154 maxLoudness = 0 ;
155+ _mfccDynamicWeightsArray = ArrayOps . newCommon1dArray ( Meyda . numberOfMFCCCoefficients , 1 ) ;
156+ _mfccLastArray = ArrayOps . newCommon1dArray ( Meyda . numberOfMFCCCoefficients , 0 ) ;
134157
135158 if ( useSequenceAnalyzer ) {
136159 //Buffer the start of a sequence to analyze for keywords/trigger/wake-words etc.
137- let sequenceStartFrames = Math . round ( ( sequenceTimeForTrigger + 1000 ) / 1000 * ( inputSampleRate / inputSampleSize ) ) ; //TODO: why is this almost 2 times more than expected?
160+ let sequenceStartFrames = Math . round ( ( sequenceTimeForTrigger + 1000 ) / 1000 * ( inputSampleRate / _vadBufferSize ) ) ; //TODO: why is this almost 2 times more than expected?
138161 mfccSequenceStartBuffer = ArrayOps . newCommon2dArray ( sequenceStartFrames , Meyda . numberOfMFCCCoefficients , 0 ) ;
139162 loudnessSequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
140163 feature1SequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
@@ -156,6 +179,7 @@ function ready(){
156179 inputIsFloat32 : isFloat32Input ,
157180 processBufferSize : processBufferSize ,
158181 vadMode : vadMode ,
182+ vadThreshold : vadThreshold ,
159183 vadFramesMax : _vadFrames ,
160184 vadBufferSize : _vadBufferSize ,
161185 vadFrameTimeMs : _vadFrameTimeMs ,
@@ -171,8 +195,8 @@ function constructWorker(options) {
171195 channelCount = 1 ; //options.setup.channelCount || 1; //TODO: only MONO atm
172196 inputSampleSize = options . setup . inputSampleSize || 512 ;
173197 processBufferSize = options . setup . bufferSize || inputSampleSize ;
174- vadMode = ( options . setup . vadMode != undefined ) ? options . setup . vadMode : 3 ;
175- _vadThreshold = ( 1 + vadMode / 10 ) ;
198+ vadMode = options . setup . vadMode || 3 ;
199+ vadThreshold = options . setup . vadThreshold || vadDefaultThresholds [ vadMode ] || 3 ;
176200 _warmUpFrames = Math . round ( 2 * inputSampleRate / inputSampleSize ) ; //input- or processBufferSize? We want ~2s so input makes sense
177201 _totalFrames = 0 ;
178202
@@ -193,8 +217,8 @@ function constructWorker(options) {
193217 }
194218
195219 //Meyda options and defaults
196- Meyda . melBands = 26 ;
197- Meyda . numberOfMFCCCoefficients = 13 ;
220+ Meyda . melBands = 40 ; //40 26;
221+ Meyda . numberOfMFCCCoefficients = 20 ; // 13;
198222 Meyda . windowingFunction = "hanning" ; //"hamming"
199223 var meydaRequiredFeatures = [ "mfcc" , "loudness" ] ;
200224 //https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec. but is buggy!?)
@@ -211,12 +235,7 @@ function constructWorker(options) {
211235 }
212236 } ) ;
213237 }
214- //Meyda requirements
215- Meyda . sampleRate = inputSampleRate ;
216- Meyda . bufferSize = inputSampleSize ;
217- if ( ! Meyda . bufferSize || ( Meyda . bufferSize & ( Meyda . bufferSize - 1 ) != 0 ) ) {
218- throw { name : "VadModuleMeydaError" , message : "Meyda buffer-size must be power of 2, e.g. 128, 256, 512, 1024, ..." } ;
219- }
238+ //Meyda requirements (1)
220239 if ( ! meydaFeatures ) {
221240 Meyda . features = meydaRequiredFeatures ;
222241 } else {
@@ -335,11 +354,11 @@ function resetSequence(){
335354 _sequenceIsDone = false ;
336355 _sequenceCheckedTrigger = false ;
337356}
338- function registerEvent ( code , msg , data ) {
357+ function registerEvent ( code , _msg , data ) {
339358 var msg = {
340359 vadSequenceCode : code ,
341- vadSequenceMsg : msg
342- } ;
360+ vadSequenceMsg : _msg
361+ }
343362 switch ( code ) {
344363 //case 1: voice start
345364 //case 2: sequence start
@@ -363,6 +382,27 @@ function registerEvent(code, msg, data){
363382 postMessage ( msg ) ;
364383}
365384
/**
 * Classify a single VAD frame as voice (1) or no voice (0).
 *
 * When 'vadMode' is 3 the decision is based on the MFCC values: every coefficient
 * carries a dynamic weight that is multiplied by 0.66 while the coefficient changed
 * by less than 20% relative to the previous call and is reset to 1.0 otherwise.
 * The mean of the weighted absolute coefficients minus 'averageLoudness' is then
 * compared to 'vadThreshold'.
 * For every other 'vadMode' only 'loudnessNorm' is compared to 'vadThreshold'.
 *
 * Side effects (mode 3 only): updates the entries of '_mfccLastArray' and
 * '_mfccDynamicWeightsArray' in place.
 *
 * @param {Array<number>} mfccArray - MFCC values of the current frame (presumably the Meyda 'mfcc' feature - confirm with caller)
 * @param {number} loudnessNorm - loudness value compared to 'vadThreshold' in modes other than 3
 * @param {number} averageLoudness - value subtracted from the weighted MFCC mean in mode 3
 * @returns {number} 1 if the threshold is exceeded, 0 otherwise
 */
function getVoiceActivity(mfccArray, loudnessNorm, averageLoudness){
	//all modes except 3: plain threshold on the given loudness value
	if (vadMode != 3){
		return (loudnessNorm > vadThreshold) ? 1 : 0;
	}

	//mode 3: weighted MFCC magnitude
	let weightedTotal = 0;
	for (let k = 0; k < mfccArray.length; k++){
		const current = mfccArray[k];
		//relative change to previous frame; a previous value of 0 yields Infinity or NaN,
		//both fail the '<' test below and therefore reset the weight
		const relativeChange = Math.abs(current / _mfccLastArray[k] - 1);
		_mfccDynamicWeightsArray[k] = (relativeChange < 0.20)
			? (_mfccDynamicWeightsArray[k] * 0.66)	//almost static coefficient: fade out
			: 1.0;									//coefficient moved: full weight again
		_mfccLastArray[k] = current;
		weightedTotal += Math.abs(current * _mfccDynamicWeightsArray[k]);
	}
	const signal = weightedTotal / mfccArray.length - averageLoudness;
	if (signal > vadThreshold){
		return 1;
	}
	return 0;
}
405+
366406function process ( data ) {
367407 //expected: data.samples, data.sampleRate, data.channels, data.type
368408 //might have: data.rms - TODO: make use of?
@@ -402,34 +442,38 @@ function process(data) {
402442 let mfcc = new Array ( vadFramesAvailable ) ;
403443 //let moreFeatures = new Array(vadFramesAvailable);
404444 let n = 0 ;
405- while ( _processRingBuffer . framesAvailable >= _vadBufferSize ) {
445+ while ( _processRingBuffer . framesAvailable >= _vadBufferSize ) {
406446 //pull samples
407447 _processRingBuffer . pull ( _vadBuffer ) ;
408448
409449 //Meyda features
410450 let features = Meyda . extract ( Meyda . features , _vadBuffer [ 0 ] ) ; //we don't add '_previousVadBuffer[0]' because it saves time and 'spectralFlux' is buggy
411451 //console.log("features_meyda", features);
412452
413- //let loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
414- //let loudness = features.loudness.total;
415- let loudness = features . loudness . specific . slice ( 1 , 5 ) . reduce ( function ( a , b ) { return a + b ; } ) ;
453+ let loudness ;
454+ //loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
455+ //loudness = features.loudness.total;
456+ if ( vadMode == 1 ) {
457+ loudness = features . loudness . specific . slice ( 1 , 5 ) . reduce ( function ( a , b ) { return a + b ; } ) ; //1-5 on the bark scale
458+ } else {
459+ loudness = features . mfcc . reduce ( function ( a , c ) { return ( a + Math . abs ( c ) ) ; } ) / features . mfcc . length ; //avg(...abs(MFCC[i]))
460+ }
416461 maxLoudness = Math . max ( maxLoudness , loudness ) ;
417462 if ( movingAvgLoudness == undefined ) {
418- movingAvgLoudness = loudness * _vadThreshold ;
463+ movingAvgLoudness = loudness * vadThreshold ;
419464 }
420465 if ( _totalFrames < _warmUpFrames ) {
421466 movingAvgLoudness = getWeightedMovingAverage ( movingAvgLoudness , loudness , 10 ) ;
422467 } else {
423468 movingAvgLoudness = getWeightedMovingAverage ( movingAvgLoudness , loudness , _movingAvgLoudnessWeight ) ;
424469 }
425470
426- mfcc [ n ] = features . mfcc ;
427-
428471 //activity check
429- let loudnessNorm = ( loudness / movingAvgLoudness ) ;
430- let voiceActivity = loudnessNorm > _vadThreshold ? 1 : 0 ;
472+ let loudnessNorm = ( loudness - movingAvgLoudness ) ;
473+ let voiceActivity = getVoiceActivity ( features . mfcc , loudnessNorm , movingAvgLoudness ) ;
431474 vadResults [ n ] = voiceActivity ;
432475 loudnessResults [ n ] = loudness ;
476+ mfcc [ n ] = features . mfcc ;
433477 //moreFeatures[n] = [];
434478
435479 //voice energy and sequence check
@@ -442,10 +486,10 @@ function process(data) {
442486 }
443487 if ( useSequenceAnalyzer ) {
444488 ArrayOps . pushAndShift ( mfccSequenceStartBuffer , features . mfcc ) ;
445- ArrayOps . pushAndShift ( loudnessSequenceStartBuffer , ( loudnessNorm > 1 ) ? ( loudnessNorm - 1 ) : 0 ) ; //loudness
489+ ArrayOps . pushAndShift ( loudnessSequenceStartBuffer , loudnessNorm ) ; // (loudnessNorm > 1)? (loudnessNorm-1) : 0); //loudness
446490 ArrayOps . pushAndShift ( feature1SequenceStartBuffer , voiceActivity ) ;
447491 ArrayOps . pushAndShift ( feature2SequenceStartBuffer , voiceEnergy ) ;
448- ArrayOps . pushAndShift ( feature3SequenceStartBuffer , 0 ) ;
492+ ArrayOps . pushAndShift ( feature3SequenceStartBuffer , features . loudness . specific ) ;
449493 sequenceDetector ( voiceActivity ) ;
450494 }
451495 n ++ ;
@@ -490,6 +534,7 @@ function release(options){
490534 //_previousVadBuffer = null;
491535 _totalFrames = 0 ;
492536 mfccSequenceStartBuffer = null ;
537+ _mfccDynamicWeightsArray = null ;
493538 loudnessSequenceStartBuffer = null ;
494539 feature1SequenceStartBuffer = null ;
495540 feature2SequenceStartBuffer = null ;
0 commit comments