1
1
#pragma once
2
2
3
+ // Configure FFT to output 16 bit fixed point.
4
+ #define FIXED_POINT 16
5
+
3
6
#include < TensorFlowLite.h>
4
7
5
8
#include < cmath>
@@ -73,7 +76,7 @@ struct TfLiteConfig {
73
76
// the frequency information. This has to be a power of two, and since
74
77
// we're dealing with 30ms of 16KHz inputs, which means 480 samples, this
75
78
// is the next value.
76
- int kMaxAudioSampleSize = 480 ;
79
+ // int kMaxAudioSampleSize = 320; //512; // 480
77
80
int kAudioSampleFrequency = 16000 ;
78
81
79
82
// Number of audio channels - is usually 1. If 2 we reduce it to 1 by averaging the 2 channels
@@ -90,6 +93,8 @@ struct TfLiteConfig {
90
93
int kSlicesToProcess = 3 ;
91
94
92
95
/// Total number of feature elements: kFeatureSliceSize elements per slice times kFeatureSliceCount slices.
int featureElementCount () { return kFeatureSliceSize * kFeatureSliceCount ; }
96
+ /// Number of audio samples covered by one feature slice window
+ /// (kFeatureSliceDurationMs of audio at kAudioSampleFrequency).
+ /// The multiplication is done before the division by 1000 so that sample
+ /// rates which are not a multiple of 1000 Hz (e.g. 44100) are not truncated
+ /// early; the 32-bit intermediate avoids overflow where int is 16 bits.
+ int audioSampleSize () { return static_cast<int>(static_cast<int32_t>(kFeatureSliceDurationMs) * kAudioSampleFrequency / 1000); }
97
+ /// Number of audio samples between the start of two consecutive feature
+ /// slices (kFeatureSliceStrideMs of audio at kAudioSampleFrequency).
+ /// The multiplication is done before the division by 1000 so that sample
+ /// rates which are not a multiple of 1000 Hz (e.g. 44100) are not truncated
+ /// early; the 32-bit intermediate avoids overflow where int is 16 bits.
+ int strideSampleSize () { return static_cast<int>(static_cast<int32_t>(kFeatureSliceStrideMs) * kAudioSampleFrequency / 1000); }
93
98
94
99
// Parameters for RecognizeCommands
95
100
int32_t average_window_duration_ms = 1000 ;
@@ -190,7 +195,7 @@ class TfLiteResultsQueue {
190
195
template <int N>
191
196
class TfLiteAbstractRecognizeCommands {
192
197
public:
193
- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
198
+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
194
199
const int32_t current_time_ms,
195
200
const char ** found_command, uint8_t * score,
196
201
bool * is_new_command) = 0;
@@ -225,32 +230,41 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
225
230
// further recognitions for a set time after one has been triggered, which can
226
231
// help reduce spurious recognitions.
227
232
228
- explicit TfLiteRecognizeCommands () {
233
+ TfLiteRecognizeCommands () {
229
234
previous_top_label_ = " silence" ;
230
235
previous_top_label_time_ = std::numeric_limits<int32_t >::min ();
231
236
kCategoryCount = N;
232
237
}
233
238
234
239
/// Setup parameters from config
235
240
bool begin (TfLiteConfig cfg) override {
241
+ if (kCategoryCount ==0 ){
242
+ LOGE (" kCategoryCount must not be 0" );
243
+ return false ;
244
+ }
245
+ if (cfg.labels ==nullptr ){
246
+ LOGE (" config.labels not defined" );
247
+ return false ;
248
+ }
236
249
average_window_duration_ms_ = cfg.average_window_duration_ms ;
237
250
detection_threshold_ = cfg.detection_threshold ;
238
251
suppression_ms_ = cfg.suppression_ms ;
239
252
minimum_count_ = cfg.minimum_count ;
240
253
kCategoryLabels = cfg.labels ;
241
- if (cfg.labels ==0 ){
242
- LOGW (" config.labels not defined" );
243
- return false ;
244
- }
254
+ started = true ;
245
255
return true ;
246
256
}
247
257
248
258
// Call this with the results of running a model on sample data.
249
- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
259
+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
250
260
const int32_t current_time_ms,
251
261
const char ** found_command, uint8_t * score,
252
262
bool * is_new_command) override {
253
263
LOGD (LOG_METHOD);
264
+ if (!started){
265
+ LOGE (" TfLiteRecognizeCommands not started" );
266
+ return kTfLiteError ;
267
+ }
254
268
if ((latest_results->dims ->size != 2 ) ||
255
269
(latest_results->dims ->data [0 ] != 1 ) ||
256
270
(latest_results->dims ->data [1 ] != kCategoryCount )) {
@@ -359,6 +373,7 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
359
373
int32_t minimum_count_;
360
374
int kCategoryCount ;
361
375
const char ** kCategoryLabels = nullptr ;
376
+ bool started = false ;
362
377
363
378
// Working variables
364
379
TfLiteResultsQueue<N> previous_results_;
@@ -385,13 +400,20 @@ class TfLiteAudioFeatureProvider {
385
400
virtual bool begin (TfLiteConfig config) {
386
401
LOGD (LOG_METHOD);
387
402
cfg = config;
403
+ kMaxAudioSampleSize = cfg.audioSampleSize ();
404
+ kStrideSampleSize = cfg.strideSampleSize ();
405
+ kKeepSampleSize = kMaxAudioSampleSize - kStrideSampleSize ;
406
+
407
+ // Allocate ring buffer
388
408
if (p_buffer == nullptr ) {
389
- p_buffer = new audio_tools::RingBuffer<int16_t >(cfg. kMaxAudioSampleSize );
390
- LOGD (" Allocating buffer for %d samples" , cfg. kMaxAudioSampleSize );
409
+ p_buffer = new audio_tools::RingBuffer<int16_t >(kMaxAudioSampleSize );
410
+ LOGD (" Allocating buffer for %d samples" , kMaxAudioSampleSize );
391
411
}
412
+
392
413
// Initialize the feature data to default values.
393
414
if (feature_data_ == nullptr ) {
394
- feature_data_ = new int8_t [cfg.featureElementCount ()]{}; // initialzed array
415
+ feature_data_ = new int8_t [cfg.featureElementCount ()];
416
+ memset (feature_data_,0 , cfg.featureElementCount ());
395
417
}
396
418
397
419
TfLiteStatus init_status = initializeMicroFeatures ();
@@ -428,15 +450,13 @@ class TfLiteAudioFeatureProvider {
428
450
429
451
protected:
430
452
TfLiteConfig cfg;
431
- // int feature_size_;
432
453
int8_t * feature_data_ = nullptr ;
433
- // Make sure we don't try to use cached information if this is the first
434
- // call into the provider.
435
- bool is_first_run_ = true ;
436
- bool g_is_first_time = true ;
437
- // const char** kCategoryLabels;
438
454
audio_tools::RingBuffer<int16_t >* p_buffer = nullptr ;
439
455
FrontendState g_micro_features_state;
456
+ FrontendConfig config;
457
+ int kMaxAudioSampleSize ;
458
+ int kStrideSampleSize ;
459
+ int kKeepSampleSize ;
440
460
441
461
// If we can avoid recalculating some slices, just move the existing
442
462
// data up in the spectrogram, to perform something like this: last time
@@ -452,26 +472,32 @@ class TfLiteAudioFeatureProvider {
452
472
// +-----------+ +-----------+
453
473
virtual void addSlice () {
454
474
LOGD (LOG_METHOD);
475
+ // shift feature_data_ down by one slice, dropping the oldest slice
455
476
memmove (feature_data_, feature_data_ + cfg.kFeatureSliceSize ,
456
477
(cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
457
478
458
479
// copy data from buffer to audio_samples
459
- int16_t audio_samples[cfg.kMaxAudioSampleSize ];
460
- int audio_samples_size =
461
- p_buffer->readArray (audio_samples, cfg.kMaxAudioSampleSize );
480
+ int16_t audio_samples[kMaxAudioSampleSize ];
481
+ int audio_samples_size = p_buffer->readArray (audio_samples, kMaxAudioSampleSize );
482
+
483
+ // check size
484
+ if (audio_samples_size!=kMaxAudioSampleSize ){
485
+ LOGE (" audio_samples_size=%d != kMaxAudioSampleSize=%d" ,audio_samples_size, kMaxAudioSampleSize );
486
+ }
462
487
488
+ // keep some data to be reprocessed - move by kStrideSampleSize
489
+ p_buffer->writeArray (audio_samples+kStrideSampleSize , kKeepSampleSize );
463
490
464
491
// the new slice data will always be stored at the end
465
- int8_t * new_slice_data =
466
- feature_data_ + ((cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
467
- size_t num_samples_read = audio_samples_size;
492
+ int8_t * new_slice_data = feature_data_ + ((cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
493
+ size_t num_samples_read = 0 ;
468
494
if (generateMicroFeatures (audio_samples, audio_samples_size,
469
- cfg.kFeatureSliceSize , new_slice_data,
495
+ new_slice_data, cfg.kFeatureSliceSize ,
470
496
&num_samples_read) != kTfLiteOk ) {
471
497
LOGE (" Error generateMicroFeatures" );
472
498
}
473
499
474
- // printFeatures();
500
+ // printFeatures();
475
501
}
476
502
477
503
/// For debugging: print feature matrix
@@ -483,11 +509,11 @@ class TfLiteAudioFeatureProvider {
483
509
}
484
510
Serial.println ();
485
511
}
512
+ Serial.println (" ------------" );
486
513
}
487
514
488
515
virtual TfLiteStatus initializeMicroFeatures () {
489
516
LOGD (LOG_METHOD);
490
- FrontendConfig config;
491
517
config.window .size_ms = cfg.kFeatureSliceDurationMs ;
492
518
config.window .step_size_ms = cfg.kFeatureSliceStrideMs ;
493
519
config.noise_reduction .smoothing_bits = 10 ;
@@ -506,38 +532,42 @@ class TfLiteAudioFeatureProvider {
506
532
config.log_scale .scale_shift = 6 ;
507
533
if (!FrontendPopulateState (&config, &g_micro_features_state,
508
534
cfg.kAudioSampleFrequency )) {
509
- LOGE (" FrontendPopulateState () failed" );
535
+ LOGE (" frontendPopulateState () failed" );
510
536
return kTfLiteError ;
511
537
}
512
- g_is_first_time = true ;
513
538
return kTfLiteOk ;
514
539
}
515
540
516
- // This is not exposed in any header, and is only used for testing, to ensure
517
- // that the state is correctly set up before generating results.
518
- void setMicroFeaturesNoiseEstimates (const uint32_t * estimate_presets) {
519
- LOGD (LOG_METHOD);
520
- for (int i = 0 ; i < g_micro_features_state.filterbank .num_channels ; ++i) {
521
- g_micro_features_state.noise_reduction .estimate [i] = estimate_presets[i];
522
- }
523
- }
541
+ // // This is not exposed in any header, and is only used for testing, to ensure
542
+ // // that the state is correctly set up before generating results.
543
+ // void setMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
544
+ // LOGD(LOG_METHOD);
545
+ // for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
546
+ // g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
547
+ // }
548
+ // }
524
549
525
550
virtual TfLiteStatus generateMicroFeatures (const int16_t * input, int input_size,
526
- int output_size, int8_t * output,
551
+ int8_t * output, int output_size,
527
552
size_t * num_samples_read) {
528
553
LOGD (LOG_METHOD);
529
- const int16_t * frontend_input;
530
- if (g_is_first_time) {
531
- frontend_input = input;
532
- g_is_first_time = false ;
533
- } else {
534
- frontend_input = input;
535
- }
554
+ const int16_t * frontend_input=input;
536
555
537
556
// Apply FFT
538
557
FrontendOutput frontend_output = FrontendProcessSamples (
539
558
&g_micro_features_state, frontend_input, input_size, num_samples_read);
540
559
560
+ // Check size
561
+ if (output_size != frontend_output.size ){
562
+ LOGE (" output_size=%d, frontend_output.size=%d" ,output_size, frontend_output.size );
563
+ }
564
+
565
+ // // check generated features
566
+ // if (input_size != *num_samples_read){
567
+ // LOGE("audio_samples_size=%d vs num_samples_read=%d", input_size, *num_samples_read);
568
+ // }
569
+
570
+
541
571
for (size_t i = 0 ; i < frontend_output.size ; ++i) {
542
572
// These scaling values are derived from those used in input_data.py in
543
573
// the training pipeline. The feature pipeline outputs 16-bit signed
@@ -675,7 +705,7 @@ class TfLiteAudioOutput : public AudioPrint {
675
705
// we submit int16 data which will be reduced to 8bits so we can send
676
706
// double the amount - 2 channels will be reduced to 1 so we multiply by
677
707
// number of channels
678
- int maxBytes = cfg.kMaxAudioSampleSize * 2 * cfg.kAudioChannels ;
708
+ int maxBytes = cfg.audioSampleSize () * 2 * cfg.kAudioChannels ;
679
709
while (open > 0 ) {
680
710
int len = min (open, maxBytes);
681
711
result += processAudio (audio + pos, len);
@@ -810,10 +840,10 @@ class TfLiteAudioOutput : public AudioPrint {
810
840
uint8_t score = 0 ;
811
841
bool is_new_command = false ;
812
842
813
- TfLiteStatus process_status = recognizer->ProcessLatestResults (
843
+ TfLiteStatus process_status = recognizer->processLatestResults (
814
844
output, current_time, &found_command, &score, &is_new_command);
815
845
if (process_status != kTfLiteOk ) {
816
- LOGE (" TfLiteRecognizeCommands::ProcessLatestResults () failed" );
846
+ LOGE (" TfLiteRecognizeCommands::processLatestResults () failed" );
817
847
return 0 ;
818
848
}
819
849
// Do something based on the recognized command. The default
0 commit comments