1
1
#pragma once
2
2
3
+ // Configure FFT to output 16 bit fixed point.
4
+ #define FIXED_POINT 16
5
+
3
6
#include <TensorFlowLite.h>
4
7
5
8
#include <cmath>
@@ -73,7 +76,7 @@ struct TfLiteConfig {
73
76
// the frequency information. This has to be a power of two, and since
74
77
// we're dealing with 30ms of 16KHz inputs, which means 480 samples, this
75
78
// is the next value.
76
- int kMaxAudioSampleSize = 480;
79
+ // int kMaxAudioSampleSize = 320; //512; // 480
77
80
int kAudioSampleFrequency = 16000;
78
81
79
82
// Number of audio channels - is usually 1. If 2 we reduce it to 1 by averaging the 2 channels
@@ -90,6 +93,8 @@ struct TfLiteConfig {
90
93
int kSlicesToProcess = 3;
91
94
92
95
int featureElementCount() { return kFeatureSliceSize * kFeatureSliceCount; }
96
+ int audioSampleSize() { return kFeatureSliceDurationMs * (kAudioSampleFrequency / 1000); }
97
+ int strideSampleSize() {return kFeatureSliceStrideMs * (kAudioSampleFrequency / 1000);}
93
98
94
99
// Parameters for RecognizeCommands
95
100
int32_t average_window_duration_ms = 1000;
@@ -190,7 +195,7 @@ class TfLiteResultsQueue {
190
195
template <int N>
191
196
class TfLiteAbstractRecognizeCommands {
192
197
public:
193
- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
198
+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
194
199
const int32_t current_time_ms,
195
200
const char** found_command, uint8_t* score,
196
201
bool* is_new_command) = 0;
@@ -225,32 +230,41 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
225
230
// further recognitions for a set time after one has been triggered, which can
226
231
// help reduce spurious recognitions.
227
232
228
- explicit TfLiteRecognizeCommands() {
233
+ TfLiteRecognizeCommands() {
229
234
previous_top_label_ = "silence";
230
235
previous_top_label_time_ = std::numeric_limits<int32_t>::min();
231
236
kCategoryCount = N;
232
237
}
233
238
234
239
/// Setup parameters from config
235
240
bool begin(TfLiteConfig cfg) override {
241
+ if (kCategoryCount==0){
242
+ LOGE("kCategoryCount must not be 0");
243
+ return false;
244
+ }
245
+ if (cfg.labels==nullptr){
246
+ LOGE("config.labels not defined");
247
+ return false;
248
+ }
236
249
average_window_duration_ms_ = cfg.average_window_duration_ms;
237
250
detection_threshold_ = cfg.detection_threshold;
238
251
suppression_ms_ = cfg.suppression_ms;
239
252
minimum_count_ = cfg.minimum_count;
240
253
kCategoryLabels = cfg.labels;
241
- if (cfg.labels==0){
242
- LOGW("config.labels not defined");
243
- return false;
244
- }
254
+ started = true;
245
255
return true;
246
256
}
247
257
248
258
// Call this with the results of running a model on sample data.
249
- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
259
+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
250
260
const int32_t current_time_ms,
251
261
const char** found_command, uint8_t* score,
252
262
bool* is_new_command) override {
253
263
LOGD(LOG_METHOD);
264
+ if (!started){
265
+ LOGE("TfLiteRecognizeCommands not started");
266
+ return kTfLiteError;
267
+ }
254
268
if ((latest_results->dims->size != 2) ||
255
269
(latest_results->dims->data[0] != 1) ||
256
270
(latest_results->dims->data[1] != kCategoryCount)) {
@@ -359,6 +373,7 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
359
373
int32_t minimum_count_;
360
374
int kCategoryCount;
361
375
const char** kCategoryLabels = nullptr;
376
+ bool started = false;
362
377
363
378
// Working variables
364
379
TfLiteResultsQueue<N> previous_results_;
@@ -385,13 +400,20 @@ class TfLiteAudioFeatureProvider {
385
400
virtual bool begin(TfLiteConfig config) {
386
401
LOGD(LOG_METHOD);
387
402
cfg = config;
403
+ kMaxAudioSampleSize = cfg.audioSampleSize();
404
+ kStrideSampleSize = cfg.strideSampleSize();
405
+ kKeepSampleSize = kMaxAudioSampleSize - kStrideSampleSize;
406
+
407
+ // Allocate ring buffer
388
408
if (p_buffer == nullptr) {
389
- p_buffer = new audio_tools::RingBuffer<int16_t>(cfg. kMaxAudioSampleSize);
390
- LOGD("Allocating buffer for %d samples", cfg. kMaxAudioSampleSize);
409
+ p_buffer = new audio_tools::RingBuffer<int16_t>(kMaxAudioSampleSize);
410
+ LOGD("Allocating buffer for %d samples", kMaxAudioSampleSize);
391
411
}
412
+
392
413
// Initialize the feature data to default values.
393
414
if (feature_data_ == nullptr) {
394
- feature_data_ = new int8_t[cfg.featureElementCount()]{}; // initialzed array
415
+ feature_data_ = new int8_t[cfg.featureElementCount()];
416
+ memset(feature_data_,0, cfg.featureElementCount());
395
417
}
396
418
397
419
TfLiteStatus init_status = initializeMicroFeatures();
@@ -428,15 +450,13 @@ class TfLiteAudioFeatureProvider {
428
450
429
451
protected:
430
452
TfLiteConfig cfg;
431
- // int feature_size_;
432
453
int8_t* feature_data_ = nullptr;
433
- // Make sure we don't try to use cached information if this is the first
434
- // call into the provider.
435
- bool is_first_run_ = true;
436
- bool g_is_first_time = true;
437
- // const char** kCategoryLabels;
438
454
audio_tools::RingBuffer<int16_t>* p_buffer = nullptr;
439
455
FrontendState g_micro_features_state;
456
+ FrontendConfig config;
457
+ int kMaxAudioSampleSize;
458
+ int kStrideSampleSize;
459
+ int kKeepSampleSize;
440
460
441
461
// If we can avoid recalculating some slices, just move the existing
442
462
// data up in the spectrogram, to perform something like this: last time
@@ -452,26 +472,32 @@ class TfLiteAudioFeatureProvider {
452
472
// +-----------+ +-----------+
453
473
virtual void addSlice() {
454
474
LOGD(LOG_METHOD);
475
+ // shift feature_data_ by one slice to make room for the new slice at the end
455
476
memmove(feature_data_, feature_data_ + cfg.kFeatureSliceSize,
456
477
(cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
457
478
458
479
// copy data from buffer to audio_samples
459
- int16_t audio_samples[cfg.kMaxAudioSampleSize];
460
- int audio_samples_size =
461
- p_buffer->readArray(audio_samples, cfg.kMaxAudioSampleSize);
480
+ int16_t audio_samples[kMaxAudioSampleSize];
481
+ int audio_samples_size = p_buffer->readArray(audio_samples, kMaxAudioSampleSize);
482
+
483
+ // check size
484
+ if (audio_samples_size!=kMaxAudioSampleSize){
485
+ LOGE("audio_samples_size=%d != kMaxAudioSampleSize=%d",audio_samples_size, kMaxAudioSampleSize);
486
+ }
462
487
488
+ // keep some data to be reprocessed - move by kStrideSampleSize
489
+ p_buffer->writeArray(audio_samples+kStrideSampleSize, kKeepSampleSize);
463
490
464
491
// the new slice data will always be stored at the end
465
- int8_t* new_slice_data =
466
- feature_data_ + ((cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
467
- size_t num_samples_read = audio_samples_size;
492
+ int8_t* new_slice_data = feature_data_ + ((cfg.kFeatureSliceCount - 1) * cfg.kFeatureSliceSize);
493
+ size_t num_samples_read = 0;
468
494
if (generateMicroFeatures(audio_samples, audio_samples_size,
469
- cfg.kFeatureSliceSize, new_slice_data,
495
+ new_slice_data, cfg.kFeatureSliceSize,
470
496
&num_samples_read) != kTfLiteOk) {
471
497
LOGE("Error generateMicroFeatures");
472
498
}
473
499
474
- // printFeatures();
500
+ // printFeatures();
475
501
}
476
502
477
503
/// For debugging: print feature matrix
@@ -483,11 +509,11 @@ class TfLiteAudioFeatureProvider {
483
509
}
484
510
Serial.println();
485
511
}
512
+ Serial.println("------------");
486
513
}
487
514
488
515
virtual TfLiteStatus initializeMicroFeatures() {
489
516
LOGD(LOG_METHOD);
490
- FrontendConfig config;
491
517
config.window.size_ms = cfg.kFeatureSliceDurationMs;
492
518
config.window.step_size_ms = cfg.kFeatureSliceStrideMs;
493
519
config.noise_reduction.smoothing_bits = 10;
@@ -506,38 +532,42 @@ class TfLiteAudioFeatureProvider {
506
532
config.log_scale.scale_shift = 6;
507
533
if (!FrontendPopulateState(&config, &g_micro_features_state,
508
534
cfg.kAudioSampleFrequency)) {
509
- LOGE("FrontendPopulateState () failed");
535
+ LOGE("frontendPopulateState () failed");
510
536
return kTfLiteError;
511
537
}
512
- g_is_first_time = true;
513
538
return kTfLiteOk;
514
539
}
515
540
516
- // This is not exposed in any header, and is only used for testing, to ensure
517
- // that the state is correctly set up before generating results.
518
- void setMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
519
- LOGD(LOG_METHOD);
520
- for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
521
- g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
522
- }
523
- }
541
+ // // This is not exposed in any header, and is only used for testing, to ensure
542
+ // // that the state is correctly set up before generating results.
543
+ // void setMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
544
+ // LOGD(LOG_METHOD);
545
+ // for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
546
+ // g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
547
+ // }
548
+ // }
524
549
525
550
virtual TfLiteStatus generateMicroFeatures(const int16_t* input, int input_size,
526
- int output_size, int8_t* output,
551
+ int8_t* output, int output_size,
527
552
size_t* num_samples_read) {
528
553
LOGD(LOG_METHOD);
529
- const int16_t* frontend_input;
530
- if (g_is_first_time) {
531
- frontend_input = input;
532
- g_is_first_time = false;
533
- } else {
534
- frontend_input = input;
535
- }
554
+ const int16_t* frontend_input=input;
536
555
537
556
// Apply FFT
538
557
FrontendOutput frontend_output = FrontendProcessSamples(
539
558
&g_micro_features_state, frontend_input, input_size, num_samples_read);
540
559
560
+ // Check size
561
+ if (output_size != frontend_output.size){
562
+ LOGE("output_size=%d, frontend_output.size=%d",output_size, frontend_output.size);
563
+ }
564
+
565
+ // // check generated features
566
+ // if (input_size != *num_samples_read){
567
+ // LOGE("audio_samples_size=%d vs num_samples_read=%d", input_size, *num_samples_read);
568
+ // }
569
+
570
+
541
571
for (size_t i = 0; i < frontend_output.size; ++i) {
542
572
// These scaling values are derived from those used in input_data.py in
543
573
// the training pipeline. The feature pipeline outputs 16-bit signed
@@ -675,7 +705,7 @@ class TfLiteAudioOutput : public AudioPrint {
675
705
// we submit int16 data which will be reduced to 8 bits so we can send
676
706
// double the amount - 2 channels will be reduced to 1 so we multiply by
677
707
// number of channels
678
- int maxBytes = cfg.kMaxAudioSampleSize * 2 * cfg.kAudioChannels;
708
+ int maxBytes = cfg.audioSampleSize() * 2 * cfg.kAudioChannels;
679
709
while (open > 0) {
680
710
int len = min(open, maxBytes);
681
711
result += processAudio(audio + pos, len);
@@ -810,10 +840,10 @@ class TfLiteAudioOutput : public AudioPrint {
810
840
uint8_t score = 0;
811
841
bool is_new_command = false;
812
842
813
- TfLiteStatus process_status = recognizer->ProcessLatestResults (
843
+ TfLiteStatus process_status = recognizer->processLatestResults (
814
844
output, current_time, &found_command, &score, &is_new_command);
815
845
if (process_status != kTfLiteOk) {
816
- LOGE("TfLiteRecognizeCommands::ProcessLatestResults () failed");
846
+ LOGE("TfLiteRecognizeCommands::processLatestResults () failed");
817
847
return 0;
818
848
}
819
849
// Do something based on the recognized command. The default
0 commit comments