twilio-labs · kashikhan1 · Sep 9, 2024 · Sep 9, 2024
diff --git a/app.js b/app.js
@@ -36,6 +36,7 @@ app.ws('/connection', (ws) => {
     // Filled in from start message
     let streamSid;
     let callSid;
+    let globalInterval; // handle of the setInterval timer armed in the 'utterance' handler; cleared in the 'transcription', 'gptreply' and 'audiosent' handlers
 
     const gptService = new GptService();
     const streamService = new StreamService(ws);
@@ -71,28 +72,36 @@ app.ws('/connection', (ws) => {
       }
     });
 
-    transcriptionService.on('utterance', async (text) => {
+    transcriptionService.on('utterance', async ({ text, duration, start }) => {  
       // This is a bit of a hack to filter out empty utterances
-      if(marks.length > 0 && text?.length > 5) {
+      if(marks.length > 0 && text?.trim()?.length) {
         console.log('Twilio -> Interruption, Clearing stream'.red);
         ws.send(
           JSON.stringify({
             streamSid,
             event: 'clear',
           })
         );
+      } else if (duration > 4 && !text?.trim()?.length && !marks.length) {
+        globalInterval = setInterval(() => {
+          console.log('Interval running...');
+        }, 2000); 
+        console.log(`durration=${duration}, start=${start}`.red);
+        ttsService.generate({partialResponseIndex: null, partialResponse: 'sorry! are you there i dont listen anything?'}, ++interactionCount);
       }
     });
 
     transcriptionService.on('transcription', async (text) => {
       if (!text) { return; }
       console.log(`Interaction ${interactionCount} – STT -> GPT: ${text}`.yellow);
+      clearInterval(globalInterval); // a non-empty transcript arrived: stop the timer armed by the 'utterance' handler
       gptService.completion(text, interactionCount);
       interactionCount += 1;
     });
 
     gptService.on('gptreply', async (gptReply, icount) => {
       console.log(`Interaction ${icount}: GPT -> TTS: ${gptReply.partialResponse}`.green );
+      clearInterval(globalInterval); // a GPT reply is being handed to TTS: stop the timer armed by the 'utterance' handler
       ttsService.generate(gptReply, icount);
     });
 
@@ -104,6 +113,7 @@ app.ws('/connection', (ws) => {
 
     streamService.on('audiosent', (markLabel) => {
       marks.push(markLabel);
+      clearInterval(globalInterval); // an 'audiosent' event arrived: stop the timer armed by the 'utterance' handler
     });
   } catch (err) {
     console.log(err);

diff --git a/package.json b/package.json
@@ -14,7 +14,7 @@
   "author": "Charlie Weems",
   "license": "MIT",
   "dependencies": {
-    "@deepgram/sdk": "^3.3.4",
+    "@deepgram/sdk": "^3.6.0",
     "colors": "^1.4.0",
     "dotenv": "^16.3.1",
     "express": "^4.19.2",

diff --git a/services/transcription-service.js b/services/transcription-service.js
@@ -15,7 +15,9 @@ class TranscriptionService extends EventEmitter {
       punctuate: true,
       interim_results: true,
       endpointing: 200,
-      utterance_end_ms: 1000
+      utterance_end_ms: 1000,
+      smart_format: true,
+      vad_events: true, // NOTE(review): no listener for VAD events is visible in this diff — confirm one is registered or drop the option
     });
 
     this.finalResult = '';
@@ -44,7 +46,7 @@ class TranscriptionService extends EventEmitter {
         // console.log(text, "is_final: ", transcription?.is_final, "speech_final: ", transcription.speech_final);
         // if is_final that means that this chunk of the transcription is accurate and we need to add it to the finalResult 
         if (transcriptionEvent.is_final === true && text.trim().length > 0) {
-          this.finalResult += ` ${text}`;
+          this.finalResult += ` ${text.trim()}`; // trim the chunk so each append contributes exactly one separating space
           // if speech_final and is_final that means this text is accurate and it's a natural pause in the speakers speech. We need to send this to the assistant for processing
           if (transcriptionEvent.speech_final === true) {
             this.speechFinal = true; // this will prevent a utterance end which shows up after speechFinal from sending another response
@@ -55,7 +57,12 @@ class TranscriptionService extends EventEmitter {
             this.speechFinal = false;
           }
         } else {
-          this.emit('utterance', text);
+          console.log(`STT -> Deepgram transcription: ${text}`.yellow);
+            this.emit('utterance', {
+              text: text,
+              duration: transcriptionEvent.duration,
+              start: transcriptionEvent.start,
+            });
         }
       });