@@ -54,13 +54,16 @@ let _vadFrames; //each frame processes one chunk of '_vadBufferSize' as long
5454let _vadFrameTimeMs ; //real time (ms) of one vadFrame (defined by sample-rate and buffer size)
5555let _vadBufferSize ; //size of a single vadFrame (restrictions apply)
5656let _vadBuffer ;
57- let _previousVadBuffer ;
57+ // let _previousVadBuffer;
5858let _transferFun ;
5959
6060//parameters to calculate vad
61- let _movingAvgLoudness ;
62- let _maxLoudness = 0 ;
61+ let movingAvgLoudness ;
62+ let maxLoudness = 0 ;
6363let _movingAvgLoudnessWeight = 800 ; //TODO: make variable and normalize with sample-rate
64+ let _vadThreshold ;
65+ let _warmUpFrames ;
66+ let _totalFrames ;
6467
6568//sequence control
6669let useSequenceAnalyzer = false ;
@@ -69,11 +72,17 @@ let voiceResetTime;
6972let silenceActivationTime ;
7073let maxSequenceTime ;
7174let minSequenceTime ;
75+ let sequenceTimeForTrigger = 1100 ; //TODO: make variable
76+ let mfccSequenceStartBuffer ;
77+ let loudnessSequenceStartBuffer ;
78+ let feature1SequenceStartBuffer ;
79+ let feature2SequenceStartBuffer ;
80+ let feature3SequenceStartBuffer ;
7281
7382let _sequenceVoiceTime ;
7483let _sequenceSilenceTime ;
7584let _sequenceSawVoice , _sequenceSawSilenceAfterVoice , _sequenceFinishedVoice ;
76- let _sequenceIsActive , _sequenceIsDone , _sequenceStartedAt ;
85+ let _sequenceIsActive , _sequenceIsDone , _sequenceStartedAt , _sequenceCheckedTrigger ;
7786
7887let _isFirstValidProcess ;
7988
@@ -86,7 +95,7 @@ function init(){
8695 if ( inputSampleRate < 8000 || inputSampleRate > 48000 ) {
8796 throw JSON . stringify ( new SampleRateException ( "For this module sample-rate has to be between 8000 and 48000 Hz." ) ) ;
8897 }
89- var allowedBufferSizes = [ 8192 , 4096 , 2048 , 1024 , 512 , 256 , 128 ] ; //recommended: 10-30ms frame length, e.g. 256 /16000 = 16ms
98+ var allowedBufferSizes = [ 8192 , 4096 , 2048 , 1024 , 512 , 256 , 128 ] ; //recommended: 10-30ms frame length, e.g. 512 /16000 = 32ms (recommended)
9099 _vadBufferSize = 0 ;
91100 for ( let i = 0 ; i < allowedBufferSizes . length ; i ++ ) {
92101 if ( processBufferSize == allowedBufferSizes [ i ] || processBufferSize % allowedBufferSizes [ i ] == 0 ) {
@@ -107,7 +116,7 @@ function init(){
107116 var ringBufferSize = processBufferSize + inputSampleSize ; //TODO: check size again
108117 _processRingBuffer = new RingBuffer ( ringBufferSize , channelCount , "Float32" ) ;
109118 _vadBuffer = [ new Float32Array ( _vadBufferSize ) ] ;
110- _previousVadBuffer = [ new Float32Array ( _vadBufferSize ) ] ;
119+ // _previousVadBuffer = [new Float32Array(_vadBufferSize)];
111120 if ( isFloat32Input ) {
112121 //we need float32 for Meyda so this is all good
113122 _transferFun = function ( thisArray , channel , i ) {
@@ -120,6 +129,19 @@ function init(){
120129 }
121130 }
122131
132+ movingAvgLoudness = undefined ;
133+ maxLoudness = 0 ;
134+
135+ if ( useSequenceAnalyzer ) {
136+ //Buffer the start of a sequence to analyze for keywords/trigger/wake-words etc.
137+ let sequenceStartFrames = Math . round ( ( sequenceTimeForTrigger + 1000 ) / 1000 * ( inputSampleRate / inputSampleSize ) ) ; //TODO: why is this almost 2 times more than expected?
138+ mfccSequenceStartBuffer = ArrayOps . newCommon2dArray ( sequenceStartFrames , Meyda . numberOfMFCCCoefficients , 0 ) ;
139+ loudnessSequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
140+ feature1SequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
141+ feature2SequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
142+ feature3SequenceStartBuffer = ArrayOps . newCommon1dArray ( sequenceStartFrames , 0 ) ;
143+ }
144+
123145 resetSequence ( ) ;
124146
125147 _isFirstValidProcess = true ;
@@ -150,6 +172,10 @@ function constructWorker(options) {
150172 inputSampleSize = options . setup . inputSampleSize || 512 ;
151173 processBufferSize = options . setup . bufferSize || inputSampleSize ;
152174 vadMode = ( options . setup . vadMode != undefined ) ? options . setup . vadMode : 3 ;
175+ _vadThreshold = ( 1 + vadMode / 10 ) ;
176+ _warmUpFrames = Math . round ( 2 * inputSampleRate / inputSampleSize ) ; //input- or processBufferSize? We want ~2s so input makes sense
177+ _totalFrames = 0 ;
178+
153179 isFloat32Input = ( options . setup . isFloat32 != undefined ) ? options . setup . isFloat32 : false ;
154180
155181 if ( options . setup . voiceEnergyCap != undefined ) voiceEnergyCap = options . setup . voiceEnergyCap ;
@@ -161,6 +187,7 @@ function constructWorker(options) {
161187 silenceActivationTime = options . setup . sequence . silenceActivationTime || 250 ;
162188 maxSequenceTime = options . setup . sequence . maxSequenceTime || 6000 ;
163189 minSequenceTime = options . setup . sequence . minSequenceTime || 600 ;
190+ //TODO: add sequenceTimeForTrigger
164191 } else {
165192 useSequenceAnalyzer = false ;
166193 }
@@ -170,7 +197,7 @@ function constructWorker(options) {
170197 Meyda . numberOfMFCCCoefficients = 13 ;
171198 Meyda . windowingFunction = "hanning" ; //"hamming"
172199 var meydaRequiredFeatures = [ "mfcc" , "loudness" ] ;
173- //https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec.)
200+ //https://meyda.js.org/audio-features.html: "spectralCentroid", "spectralFlatness", "spectralFlux" (requires previous spec. but is buggy!? )
174201 var meydaAnalyzer = options . setup . meydaAnalyzer || { } ;
175202 var meydaFeatures = [ ] ;
176203 var meydaSettingsKeys = Object . keys ( meydaAnalyzer ) ;
@@ -207,11 +234,11 @@ function constructWorker(options) {
207234
208235//averages
/**
 * Move a running average one step towards a new sample.
 *
 * Computes `prevAvg + (nextValue - prevAvg) / weight`, i.e. the larger the
 * weight the slower the average follows new values.
 *
 * @param {number|undefined|null} prevAvg - current average; when it has not
 *     been seeded yet (undefined/null) the new sample becomes the average.
 * @param {number} nextValue - newest sample.
 * @param {number} weight - divisor applied to the difference between the new
 *     sample and the current average.
 * @returns {number} the updated average.
 */
function getWeightedMovingAverage(prevAvg, nextValue, weight){
	//Without this guard an unseeded average yields 'undefined + NaN' = NaN,
	//and NaN would then stick to every following update.
	if (prevAvg == undefined){
		return nextValue;
	}
	return (prevAvg + (nextValue - prevAvg)/weight);
}
216243
217244//sequence block
@@ -250,9 +277,16 @@ function sequenceDetector(voiceActivity){
250277 registerEvent ( 3 , 'finished_voice' ) ;
251278
252279 } else if ( _sequenceSawVoice ) {
253- if ( _sequenceIsActive && ( ( Date . now ( ) - _sequenceStartedAt ) > maxSequenceTime ) ) {
254- _sequenceIsDone = true ;
255- registerEvent ( 4 , 'finished_voice_maxtime' ) ;
280+ if ( _sequenceIsActive ) {
281+ let timePassed = ( Date . now ( ) - _sequenceStartedAt ) ;
282+ if ( timePassed > maxSequenceTime ) {
283+ _sequenceIsDone = true ;
284+ registerEvent ( 4 , 'finished_voice_maxtime' ) ;
285+ } else if ( ! _sequenceCheckedTrigger && timePassed >= sequenceTimeForTrigger ) {
286+ sequenceTriggerAnalyzer ( ) ;
287+ _sequenceCheckedTrigger = true ;
288+ registerEvent ( 6 , 'sequence_trigger_result' ) ;
289+ }
256290 }
257291 }
258292
@@ -261,6 +295,33 @@ function sequenceDetector(voiceActivity){
261295 resetSequence ( ) ;
262296 }
263297}
/**
 * Analyze the buffered start of a sequence (work in progress).
 *
 * Currently this only scans 'feature2SequenceStartBuffer' (filled with the
 * voice-energy value of each processed frame) for the longest contiguous run
 * of entries greater than 0. The first longest run wins on ties.
 *
 * @returns {{start: number, end: number, range: number}} longest run found:
 *     'start' is the index of its first entry, 'end' the index right after its
 *     last entry (exclusive) and 'range' its length. All three are 0 when the
 *     buffer is missing, empty or contains no entry greater than 0.
 */
function sequenceTriggerAnalyzer(){
	//TODO: the actual trigger/wake-word analysis is not implemented yet
	var sqVoiceEnergy = feature2SequenceStartBuffer;
	//find largest activity block - TODO: not good enough, 0 is possible (try 'hey computer')
	var bestRange = {start: 0, end: 0, range: 0};
	if (!sqVoiceEnergy || !sqVoiceEnergy.length){
		return bestRange;
	}
	var runStart = -1;		//-1 = currently not inside a run
	//Iterate one step past the end so that a run reaching the end of the buffer gets closed as well
	for (let i = 0; i <= sqVoiceEnergy.length; i++){
		let isActive = (i < sqVoiceEnergy.length) && (sqVoiceEnergy[i] > 0);
		if (isActive){
			if (runStart < 0) runStart = i;
		}else if (runStart >= 0){
			let runLength = i - runStart;
			if (runLength > bestRange.range){
				bestRange = {start: runStart, end: i, range: runLength};
			}
			//always start fresh after a gap, no matter if the run was the best one or not
			runStart = -1;
		}
	}
	//TODO: use the range, e.g.:
	//if (bestRange.range) var reducedMfccBuffer = mfccSequenceStartBuffer.slice(bestRange.start, bestRange.end);
	return bestRange;
}
264325function resetSequence ( ) {
265326 //vad
266327 voiceEnergy = 0 ;
@@ -272,6 +333,7 @@ function resetSequence(){
272333 _sequenceIsActive = false ;
273334 _sequenceStartedAt = 0 ;
274335 _sequenceIsDone = false ;
336+ _sequenceCheckedTrigger = false ;
275337}
276338function registerEvent ( code , msg , data ) {
277339 var msg = {
@@ -287,6 +349,13 @@ function registerEvent(code, msg, data){
287349 msg . vadSequenceStarted = _sequenceStartedAt ;
288350 msg . vadSequenceEnded = Date . now ( ) ;
289351 break ;
352+ case 6 :
353+ //sequence trigger-check phase data
354+ msg . vadSequenceStarted = _sequenceStartedAt ;
355+ msg . mfccProfile = mfccSequenceStartBuffer ;
356+ msg . loudnessProfile = loudnessSequenceStartBuffer ;
357+ msg . featuresArray = [ feature1SequenceStartBuffer , feature2SequenceStartBuffer , feature3SequenceStartBuffer ] ;
358+ msg . avgLoudness = movingAvgLoudness ;
290359 default :
291360 break ;
292361 }
@@ -301,6 +370,7 @@ function process(data) {
301370 //Use 1st input and output only
302371 let input = data . samples ;
303372 let thisInputSampleSize = input [ 0 ] . length ;
373+ _totalFrames ++ ;
304374
305375 if ( _isFirstValidProcess ) {
306376 _isFirstValidProcess = false ;
@@ -326,30 +396,41 @@ function process(data) {
326396 _processRingBuffer . push ( input , _transferFun ) ;
327397
328398 //Process if we have enough frames
329- var vadResults = [ ] ;
330- var loudnessResults = [ ] ;
331- var mfcc = [ ] ;
399+ let vadFramesAvailable = Math . floor ( _processRingBuffer . framesAvailable / _vadBufferSize ) ;
400+ let vadResults = new Array ( vadFramesAvailable ) ;
401+ let loudnessResults = new Array ( vadFramesAvailable ) ;
402+ let mfcc = new Array ( vadFramesAvailable ) ;
403+ //let moreFeatures = new Array(vadFramesAvailable);
404+ let n = 0 ;
332405 while ( _processRingBuffer . framesAvailable >= _vadBufferSize ) {
333406 //pull samples
334407 _processRingBuffer . pull ( _vadBuffer ) ;
335408
336409 //Meyda features
337- let features = Meyda . extract ( Meyda . features , _vadBuffer [ 0 ] , _previousVadBuffer [ 0 ] ) ;
410+ let features = Meyda . extract ( Meyda . features , _vadBuffer [ 0 ] ) ; //we don't add ' _previousVadBuffer[0]' because it saves time and 'spectralFlux' is buggy
338411 //console.log("features_meyda", features);
339- _previousVadBuffer = _vadBuffer ;
340412
341413 //let loudness = (features.loudness.specific[1] + features.loudness.specific[2] + features.loudness.specific[3]); //'specific' shows each loudness on bark scale, 'total' is the sum
342414 //let loudness = features.loudness.total;
343415 let loudness = features . loudness . specific . slice ( 1 , 5 ) . reduce ( function ( a , b ) { return a + b ; } ) ;
344- _maxLoudness = Math . max ( _maxLoudness , loudness ) ;
345- _movingAvgLoudness = getWeightedMovingAverage ( _movingAvgLoudness , loudness , _movingAvgLoudnessWeight ) ;
416+ maxLoudness = Math . max ( maxLoudness , loudness ) ;
417+ if ( movingAvgLoudness == undefined ) {
418+ movingAvgLoudness = loudness * _vadThreshold ;
419+ }
420+ if ( _totalFrames < _warmUpFrames ) {
421+ movingAvgLoudness = getWeightedMovingAverage ( movingAvgLoudness , loudness , 10 ) ;
422+ } else {
423+ movingAvgLoudness = getWeightedMovingAverage ( movingAvgLoudness , loudness , _movingAvgLoudnessWeight ) ;
424+ }
346425
347- mfcc . push ( features . mfcc ) ;
426+ mfcc [ n ] = features . mfcc ;
348427
349428 //activity check
350- var voiceActivity = ( loudness / _movingAvgLoudness ) > ( 1 + vadMode / 10 ) ? 1 : 0 ;
351- vadResults . push ( voiceActivity ) ;
352- loudnessResults . push ( loudness ) ;
429+ let loudnessNorm = ( loudness / movingAvgLoudness ) ;
430+ let voiceActivity = loudnessNorm > _vadThreshold ? 1 : 0 ;
431+ vadResults [ n ] = voiceActivity ;
432+ loudnessResults [ n ] = loudness ;
433+ //moreFeatures[n] = [];
353434
354435 //voice energy and sequence check
355436 if ( voiceActivity ) {
@@ -360,19 +441,25 @@ function process(data) {
360441 if ( voiceEnergy < 0 ) voiceEnergy = 0 ;
361442 }
362443 if ( useSequenceAnalyzer ) {
444+ ArrayOps . pushAndShift ( mfccSequenceStartBuffer , features . mfcc ) ;
445+ ArrayOps . pushAndShift ( loudnessSequenceStartBuffer , ( loudnessNorm > 1 ) ? ( loudnessNorm - 1 ) : 0 ) ; //loudness
446+ ArrayOps . pushAndShift ( feature1SequenceStartBuffer , voiceActivity ) ;
447+ ArrayOps . pushAndShift ( feature2SequenceStartBuffer , voiceEnergy ) ;
448+ ArrayOps . pushAndShift ( feature3SequenceStartBuffer , 0 ) ;
363449 sequenceDetector ( voiceActivity ) ;
364450 }
451+ n ++ ;
365452 }
366- if ( vadResults . length > 0 ) {
453+ if ( n > 0 ) {
367454 //Send info
368- //console.log("features", vadResults, loudnessResults, _movingAvgLoudness, _maxLoudness );
455+ //console.log("features", vadResults, loudnessResults, movingAvgLoudness, maxLoudness );
369456 postMessage ( {
370457 voiceActivity : vadResults ,
371458 voiceEnergy : voiceEnergy ,
372459 voiceLoudness : loudnessResults ,
373460 mfcc : mfcc ,
374- movingAvgLoudness : _movingAvgLoudness ,
375- maxLoudness : _maxLoudness
461+ movingAvgLoudness : movingAvgLoudness ,
462+ maxLoudness : maxLoudness
376463 } ) ;
377464 }
378465 }
@@ -400,9 +487,13 @@ function release(options){
400487 //destroy
401488 _processRingBuffer = null ;
402489 _vadBuffer = null ;
403- _previousVadBuffer = null ;
404- _movingAvgLoudness = undefined ;
405- _maxLoudness = 0 ;
490+ //_previousVadBuffer = null;
491+ _totalFrames = 0 ;
492+ mfccSequenceStartBuffer = null ;
493+ loudnessSequenceStartBuffer = null ;
494+ feature1SequenceStartBuffer = null ;
495+ feature2SequenceStartBuffer = null ;
496+ feature3SequenceStartBuffer = null ;
406497}
407498
408499//--- helpers ---
0 commit comments