LearnedVector · kaankutan · Oct 29, 2020 · Oct 29, 2020 · Oct 29, 2020
diff --git a/VoiceAssistant/wakeword/engine.py b/VoiceAssistant/wakeword/engine.py
@@ -7,16 +7,19 @@
 import torchaudio
 import torch
 import numpy as np
-from neuralnet.dataset import get_featurizer
 from threading import Event
+from array import array
+from neuralnet.dataset import get_featurizer
 
 
 class Listener:
-
-    def __init__(self, sample_rate=8000, record_seconds=2):
+    def __init__(self, sample_rate=8000, record_seconds=2, threshold = 300):
         self.chunk = 1024
         self.sample_rate = sample_rate
         self.record_seconds = record_seconds
+        self.threshold = threshold
+        self.silent = False
+
         self.p = pyaudio.PyAudio()
         self.stream = self.p.open(format=pyaudio.paInt16,
                         channels=1,
@@ -25,10 +28,18 @@ def __init__(self, sample_rate=8000, record_seconds=2):
                         output=True,
                         frames_per_buffer=self.chunk)
 
+    def is_silent(self, data):
+        """Return True if the peak absolute amplitude of `data` is below `self.threshold` (empty data counts as silent)."""
+        return max(map(abs, data), default=0) < self.threshold
+
     def listen(self, queue):
         while True:
-            data = self.stream.read(self.chunk , exception_on_overflow=False)
+            data = self.stream.read(self.chunk, exception_on_overflow=False)
             queue.append(data)
+            # Reinterpret the raw bytes as signed 16-bit samples (the stream is
+            # opened as paInt16) and record whether this chunk was silent, so
+            # readers of `self.silent` always see the latest chunk's state.
+            self.silent = self.is_silent(array('h', data))
             time.sleep(0.01)
 
     def run(self, queue):
@@ -40,11 +51,12 @@ def run(self, queue):
 class WakeWordEngine:
 
     def __init__(self, model_file):
-        self.listener = Listener(sample_rate=8000, record_seconds=2)
+        self.listener = Listener(sample_rate=8000, record_seconds=2, threshold = 150)
         self.model = torch.jit.load(model_file)
         self.model.eval().to('cpu')  #run on cpu
         self.featurizer = get_featurizer(sample_rate=8000)
         self.audio_q = list()
+        self.prediction = []
 
     def save(self, waveforms, fname="wakeword_temp"):
         wf = wave.open(fname, "wb")
@@ -64,43 +76,67 @@ def save(self, waveforms, fname="wakeword_temp"):
     def predict(self, audio):
         with torch.no_grad():
             fname = self.save(audio)
-            waveform, _ = torchaudio.load(fname, normalization=False)  # don't normalize on train
+            waveform, _ = torchaudio.load(fname)  # NOTE(review): normalization=False was dropped; confirm the model expects torchaudio's default waveform scaling
             mfcc = self.featurizer(waveform).transpose(1, 2).transpose(0, 1)
 
-            # TODO: read from buffer instead of saving and loading file
-            # waveform = torch.Tensor([np.frombuffer(a, dtype=np.int16) for a in audio]).flatten()
-            # mfcc = self.featurizer(waveform).transpose(0, 1).unsqueeze(1)
-
             out = self.model(mfcc)
-            pred = torch.round(torch.sigmoid(out))
-            return pred.item()
-
-    def inference_loop(self, action):
+            # One sigmoid pass yields both the rounded 0/1 decision and the raw confidence.
+            prob = torch.sigmoid(out)
+            return torch.round(prob).item(), prob.item()
+
+    def inference_loop(self, callback, sensitivity):
+        """
+        Predict while audio is not silent; when silence returns, call `callback`
+        if the mean confidence of the detection streak exceeds `sensitivity`.
+        Args: sensitivity. the lower the number, the more easily the wakeword fires.
+        """
         while True:
-            if len(self.audio_q) > 15:  # remove part of stream
-                diff = len(self.audio_q) - 15
-                for _ in range(diff):
-                    self.audio_q.pop(0)
-                action(self.predict(self.audio_q))
-            elif len(self.audio_q) == 15:
-                action(self.predict(self.audio_q))
+            # Drop the oldest chunks on every pass, even during silence: the
+            # listener keeps appending, so the buffer would otherwise grow unbounded.
+            overflow = len(self.audio_q) - 25
+            if overflow > 0:
+                del self.audio_q[:overflow]
+
+            # Only run the model on fresh, non-silent audio.
+            if overflow > 0 and not self.listener.silent:
+                value, acc = self.predict(self.audio_q)
+
+                if value == 1.0:
+                    self.prediction.append(acc)
+                else:
+                    # A negative prediction breaks the streak of detections.
+                    self.prediction = []
+
             time.sleep(0.05)
 
-    def run(self, action):
+            # Require more than 2 detections in a row; change this depending on
+            # the length of your model.
+            if self.listener.silent and len(self.prediction) > 2:
+                # Silence after a streak of detections: average their confidence.
+                avg_acc = sum(self.prediction) / len(self.prediction)
+                self.prediction = []
+
+                if avg_acc > sensitivity:
+                    callback()
+
+            # NOTE(review): a streak of 2 or fewer detections is kept across
+            # silence. The former `elif self.listener.silent > 2 and not
+            # len(self.prediction)` branch compared a bool with 2, so it never
+            # ran and was dropped; confirm whether such streaks should reset.
+
+    def run(self, callback, sensitivity):
         self.listener.run(self.audio_q)
-        thread = threading.Thread(target=self.inference_loop,
-                                    args=(action,), daemon=True)
+        thread = threading.Thread(target=self.inference_loop, args = (callback, sensitivity), daemon=True)
         thread.start()
 
 
 class DemoAction:
     """This demo action will just randomly say Arnold Schwarzenegger quotes
-
-        args: sensitivty. the lower the number the more sensitive the
+        args: sensitivity. the lower the number the more sensitive the
         wakeword is to activation.
     """
-    def __init__(self, sensitivity=10):
-        # import stuff here to prevent engine.py from 
+    def __init__(self):
+        # import stuff here to prevent engine.py from
         # importing unecessary modules during production usage
         import os
         import subprocess
@@ -111,7 +147,6 @@ def __init__(self, sensitivity=10):
         self.subprocess = subprocess
         self.detect_in_row = 0
 
-        self.sensitivity = sensitivity
         folder = realpath(join(realpath(__file__), '..', '..', '..', 'fun', 'arnold_audio'))
         self.arnold_mp3 = [
             os.path.join(folder, x)
@@ -120,38 +155,28 @@ def __init__(self, sensitivity=10):
         ]
 
     def __call__(self, prediction):
-        if prediction == 1:
-            self.detect_in_row += 1
-            if self.detect_in_row == self.sensitivity:
-                self.play()
-                self.detect_in_row = 0
-        else:
-            self.detect_in_row = 0
-
-    def play(self):
         filename = self.random.choice(self.arnold_mp3)
         try:
             print("playing", filename)
             self.subprocess.check_output(['play', '-v', '.1', filename])
         except Exception as e:
             print(str(e))
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="demoing the wakeword engine")
     parser.add_argument('--model_file', type=str, default=None, required=True,
                         help='optimized file to load. use optimize_graph.py')
-    parser.add_argument('--sensitivty', type=int, default=10, required=False,
+    parser.add_argument('--sensitivity', type=float, default=0.85, required=False,
                         help='lower value is more sensitive to activations')
 
-    args = parser.parse_args()
-    wakeword_engine = WakeWordEngine(args.model_file)
-    action = DemoAction(sensitivity=10)
-
     print("""\n*** Make sure you have sox installed on your system for the demo to work!!!
     If you don't want to use sox, change the play function in the DemoAction class
     in engine.py module to something that works with your system.\n
     """)
-    # action = lambda x: print(x)
-    wakeword_engine.run(action)
+
+    args = parser.parse_args()
+    wakeword_engine = WakeWordEngine(args.model_file)
+    action = DemoAction()
+
+    wakeword_engine.run(callback = action, sensitivity = args.sensitivity)
     threading.Event().wait()