Skip to content

Commit 8b9ef57

Browse files
authored
feat: local transcription of audio (#110)
* feat: free alternative for audio transcription * docs: Updated README.md with info on local transcription
1 parent c4c4dc5 commit 8b9ef57

File tree

7 files changed

+137
-11
lines changed

7 files changed

+137
-11
lines changed

.env.example

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@ BING_COOKIES="MUID=08AC######6940; MUIDB=08AC#####6940; ...a very long string...
77
# Determines whether the bot should detect and convert your voice messages into text
88
# Accepted values are "true" or "false"
99
TRANSCRIPTION_ENABLED="false"
10-
# This must be set for voice message transcription functionality
10+
# There are 2 ways to transcribe audio: using the OpenAI Whisper API, which costs US$0.06 per 10 minutes of audio, or running Whisper locally on your own machine.
11+
# Local transcription is slower and provides worse results, but it is free.
12+
# If you choose to use the local method, you need to install WhisperAI. Refer to the readme.md file for more information.
13+
TRANSCRIPTION_METHOD="local" # options are 'local' or 'api'
14+
# TRANSCRIPTION_LANGUAGE improves the transcription results but is not required. If you only plan to send audio in one language,
15+
# it is recommended to specify the language. If you will use multiple languages, leave it blank.
16+
# This setting only applies to local mode.
17+
TRANSCRIPTION_LANGUAGE="" # "--language pt" (pt = Portuguese), list of languages: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py (OPTIONAL)
18+
# This must be set if you are going to use the WhisperAI API for voice message transcription functionality.
1119
OPENAI_API_KEY="sk-90..."
12-
# Determines whether the bot should reply with the transcribed text from your voice messages
20+
# Determines whether the bot should reply with the transcribed text from your voice messages.
1321
# Accepted values are "true" or "false"
1422
REPLY_TRANSCRIPTION="false"
1523

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,11 @@ Sydney is designed to respond to natural language queries from users. You can as
117117

118118
### Voice Messages
119119

120-
To utilize voice messages, make sure you have FFMPEG installed on your machine and an OpenAI API key. Please note that using the voice transcription API (Whisper AI) provided by OpenAI comes with a cost of US$0.06 per 10 minutes.
120+
To utilize voice messages, ensure that FFMPEG is installed on your machine. There are two alternatives for voice transcription: using the Whisper AI API (which incurs a cost of US$0.06 per 10 minutes of audio) or using the local method. The local method is slower and less precise but is free.
121121

122-
Once you have installed FFMPEG and obtained your OpenAI API key, you will need to place the key in the `.env` file under the `OPENAI_API_KEY` variable. Additionally, set `TRANSCRIPTION_ENABLED` to `"TRUE"`. After restarting the bot, you can proceed to utilize it.
122+
1. WhisperAPI: When using the WhisperAPI, you only need to obtain an OpenAI API key. Place the key in the `.env` file under the `OPENAI_API_KEY` variable. Additionally, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"api"`. After restarting the bot, you can proceed to utilize it.
123+
124+
2. Local mode: To use the local mode, you need to install Python and Whisper. Follow the setup instructions [here](https://github.com/openai/whisper#setup). Then, in the `.env` file, set `TRANSCRIPTION_ENABLED` to `"true"` and `TRANSCRIPTION_METHOD` to `"local"`. After restarting the bot, you can proceed to utilize it.
123125

124126
### Group Chat
125127

package.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
"author": "Matheus Veiga <[email protected]>",
1212
"license": "MIT",
1313
"devDependencies": {
14+
"@types/common-tags": "^1.8.1",
1415
"@types/fluent-ffmpeg": "^2.1.21",
1516
"@types/node": "^20.3.2",
16-
"@types/common-tags": "^1.8.1",
1717
"@types/node-schedule": "^2.1.0",
1818
"@types/uuid": "^9.0.1",
1919
"nodemon": "^2.0.22",
@@ -26,11 +26,15 @@
2626
"@keyv/sqlite": "^3.6.5",
2727
"@types/qrcode-terminal": "^0.12.0",
2828
"@waylaidwanderer/chatgpt-api": "1.37.0",
29+
"common-tags": "^1.8.2",
30+
"crypto": "^1.0.1",
2931
"fluent-ffmpeg": "^2.1.2",
32+
"fs": "^0.0.1-security",
3033
"keyv": "^4.5.2",
31-
"openai": "^3.2.1",
32-
"common-tags": "^1.8.2",
3334
"node-schedule": "^2.1.1",
35+
"openai": "^3.2.1",
36+
"os": "^0.1.2",
37+
"path": "^0.12.7",
3438
"qrcode-terminal": "^0.12.0",
3539
"serialize-error": "^11.0.0",
3640
"uuid": "^9.0.0",
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { execSync } from "child_process";
2+
import { randomUUID } from "crypto";
3+
import fs from "fs";
4+
import path from "path";
5+
import { convertOggToWav } from "./audio-transcription";
6+
7+
/**
 * Transcribes a voice message by running the locally installed `whisper`
 * command-line tool.
 *
 * The audio buffer is written to a temporary `.ogg` file, converted to `.wav`
 * with `convertOggToWav`, and handed to `whisper`. The command's stdout is then
 * parsed with `parseTextAfterTimeFrame`.
 *
 * All temporary files (the `.ogg`, the `.wav`, and any output files whisper
 * left next to them) are removed even when conversion or transcription throws.
 *
 * @param audioBuffer - Raw bytes of the voice message (written out as `.ogg`).
 * @returns The transcribed text, or a "[System] ..." fallback message when no
 *   transcription could be extracted from whisper's output.
 * @throws Propagates errors from the file system, `convertOggToWav`, or the
 *   `whisper` process (e.g. not installed / non-zero exit).
 */
export async function transcribeAudioLocal(
  audioBuffer: Buffer
): Promise<string> {
  const folderPath = "./"; // Change this to specify the project folder
  const filename = randomUUID();

  const oggPath = path.join(folderPath, filename + ".ogg");
  const wavPath = path.join(folderPath, filename + ".wav");

  // Optional extra CLI arguments such as "--language pt". When the variable is
  // unset it must become an empty string; interpolating it directly would pass
  // the literal word "undefined" to whisper as an extra argument.
  const languageArgs = (process.env.TRANSCRIPTION_LANGUAGE ?? "").trim();

  // Extensions of the side files whisper may create for the input audio.
  const extensions = [
    ".srt",
    ".txt",
    ".vtt",
    ".json",
    ".srt.json",
    ".tsv",
    ".srt.efb",
  ];

  // Everything this call may leave on disk, in one list so cleanup is uniform.
  const temporaryPaths = [
    oggPath,
    wavPath,
    ...extensions.map((extension) =>
      path.join(folderPath, `${filename}${extension}`)
    ),
  ];

  try {
    fs.writeFileSync(oggPath, audioBuffer);
    await convertOggToWav(oggPath, wavPath);

    // The path is quoted so a customised folderPath containing spaces still works.
    const command = `whisper "${wavPath}" ${languageArgs}`.trim();
    const output = execSync(command, {
      encoding: "utf-8",
    });

    const parsedText = parseTextAfterTimeFrame(output);
    if (parsedText) {
      return parsedText;
    }
    return "[System] User tried to send an audio message, but the transcription failed. Please ask them to either write it in text or try again.";
  } finally {
    // Runs on success and on failure, so a failed run does not leak temp files.
    for (const temporaryPath of temporaryPaths) {
      if (fs.existsSync(temporaryPath)) {
        fs.unlinkSync(temporaryPath);
      }
    }
  }
}
56+
57+
/**
 * Extracts the spoken text from timestamped transcript lines of the form
 * `[MM:SS.mmm --> MM:SS.mmm] text` (an optional leading hours field,
 * `HH:MM:SS.mmm`, is also accepted).
 *
 * Every matching line contributes one segment; segments are trimmed and joined
 * with a single space so multi-segment transcriptions are returned in full
 * rather than being cut off after the first line.
 *
 * @param text - Raw output to scan for timestamped lines.
 * @returns The concatenated segment text, or `null` when no non-empty segment
 *   was found.
 */
function parseTextAfterTimeFrame(text: string): string | null {
  // Global flag so all segments are visited; `.` does not cross newlines, so
  // each timestamped line yields exactly one match.
  const segmentPattern =
    /\[((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s-->\s((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\]\s(.+)/g;

  const segments: string[] = [];
  for (const match of text.matchAll(segmentPattern)) {
    const segment = (match[3] ?? "").trim();
    if (segment) {
      segments.push(segment);
    }
  }

  return segments.length > 0 ? segments.join(" ") : null; // null when nothing matched
}

src/handlers/audio-transcription.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import os from "os";
55
import path from "path";
66
import { openai } from "../clients/openai";
77

8-
function convertOggToWav(oggPath: string, wavPath: string) {
8+
export function convertOggToWav(oggPath: string, wavPath: string) {
99
return new Promise((resolve, reject) => {
1010
ffmpeg(oggPath)
1111
.toFormat("wav")

src/handlers/message.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { counterRequests } from "./requests-counter";
1010
import { jsonSafeParse, react } from "../utils";
1111
import { scheduleReminder } from "./reminder";
1212
import { reminderSchema } from "../schemas/reminder";
13+
import { transcribeAudioLocal } from "./audio-transcription-local";
1314

1415
function appendSources(sources: SourceAttribution[]) {
1516
let sourcesString = "\n\n";
@@ -84,9 +85,18 @@ async function handleAudioMessage(message: Message, media: MessageMedia) {
8485
const audioBuffer = Buffer.from(media.data, "base64");
8586

8687
try {
87-
const transcription = await transcribeAudio(audioBuffer);
88+
let transcription;
89+
if (process.env.TRANSCRIPTION_METHOD === "local") {
90+
transcription = await transcribeAudioLocal(audioBuffer);
91+
} else if (process.env.TRANSCRIPTION_METHOD === "api") {
92+
transcription = await transcribeAudio(audioBuffer);
93+
} else {
94+
await message.reply(
95+
"There was a problem in the transcription of the message, the problem is related to the TRANSCRIPTION_METHOD in the .env file."
96+
);
97+
return;
98+
}
8899
message.body = transcription;
89-
90100
if (process.env.REPLY_TRANSCRIPTION === "true")
91101
await message.reply(`Transcription:\n\n${transcription}`);
92102

yarn.lock

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,11 @@ cross-spawn@^7.0.3:
828828
shebang-command "^2.0.0"
829829
which "^2.0.1"
830830

831+
crypto@^1.0.1:
832+
version "1.0.1"
833+
resolved "https://registry.yarnpkg.com/crypto/-/crypto-1.0.1.tgz#2af1b7cad8175d24c8a1b0778255794a21803037"
834+
integrity sha512-VxBKmeNcqQdiUQUW2Tzq0t377b54N2bMtXO/qiLa+6eRRmmC4qT3D4OnTGoT/U6O9aklQ/jTwbOtRMTTY8G0Ig==
835+
831836
debug@4, [email protected], debug@^4.0.0, debug@^4.1.0, debug@^4.1.1, debug@^4.3.3, debug@^4.3.4:
832837
version "4.3.4"
833838
resolved "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz"
@@ -1171,6 +1176,11 @@ fs.realpath@^1.0.0:
11711176
resolved "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz"
11721177
integrity sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==
11731178

1179+
fs@^0.0.1-security:
1180+
version "0.0.1-security"
1181+
resolved "https://registry.yarnpkg.com/fs/-/fs-0.0.1-security.tgz#8a7bd37186b6dddf3813f23858b57ecaaf5e41d4"
1182+
integrity sha512-3XY9e1pP0CVEUCdj5BmfIZxRBTSDycnbqhIOGec9QYtmVH2fbLpj86CFWkrNOkt/Fvty4KZG5lTglL9j/gJ87w==
1183+
11741184
fsevents@~2.3.2:
11751185
version "2.3.2"
11761186
resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a"
@@ -1365,6 +1375,11 @@ inherits@2, inherits@^2.0.3, inherits@^2.0.4, inherits@~2.0.0, inherits@~2.0.3:
13651375
resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz"
13661376
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
13671377

1378+
1379+
version "2.0.3"
1380+
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.3.tgz#633c2c83e3da42a502f52466022480f4208261de"
1381+
integrity sha512-x00IRNXNy63jwGkJmzPigoySHbaqpNuzKbBOmzK+g2OdZpQ9w+sxCN+VSB3ja7IAge2OP2qpfxTjeNcyjmW1uw==
1382+
13681383
inquirer-autocomplete-prompt@^3.0.0:
13691384
version "3.0.0"
13701385
resolved "https://registry.yarnpkg.com/inquirer-autocomplete-prompt/-/inquirer-autocomplete-prompt-3.0.0.tgz#b00478882feb326b34e0754a1695d912e387a63b"
@@ -2016,6 +2031,11 @@ os-tmpdir@~1.0.2:
20162031
resolved "https://registry.yarnpkg.com/os-tmpdir/-/os-tmpdir-1.0.2.tgz#bbe67406c79aa85c5cfec766fe5734555dfa1274"
20172032
integrity sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==
20182033

2034+
os@^0.1.2:
2035+
version "0.1.2"
2036+
resolved "https://registry.yarnpkg.com/os/-/os-0.1.2.tgz#f29a50c62908516ba42652de42f7038600cadbc2"
2037+
integrity sha512-ZoXJkvAnljwvc56MbvhtKVWmSkzV712k42Is2mA0+0KTSRakq5XXuXpjZjgAt9ctzl51ojhQWakQQpmOvXWfjQ==
2038+
20192039
p-defer@^3.0.0:
20202040
version "3.0.0"
20212041
resolved "https://registry.yarnpkg.com/p-defer/-/p-defer-3.0.0.tgz#d1dceb4ee9b2b604b1d94ffec83760175d4e6f83"
@@ -2070,6 +2090,14 @@ path-key@^3.0.0, path-key@^3.1.0:
20702090
resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375"
20712091
integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==
20722092

2093+
path@^0.12.7:
2094+
version "0.12.7"
2095+
resolved "https://registry.yarnpkg.com/path/-/path-0.12.7.tgz#d4dc2a506c4ce2197eb481ebfcd5b36c0140b10f"
2096+
integrity sha512-aXXC6s+1w7otVF9UletFkFcDsJeO7lSZBPUQhtb5O0xJe8LtYhj/GxldoL09bBj9+ZmE2hNoHqQSFMN5fikh4Q==
2097+
dependencies:
2098+
process "^0.11.1"
2099+
util "^0.10.3"
2100+
20732101
pend@~1.2.0:
20742102
version "1.2.0"
20752103
resolved "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz"
@@ -2137,7 +2165,7 @@ process-warning@^2.0.0, process-warning@^2.2.0:
21372165
resolved "https://registry.yarnpkg.com/process-warning/-/process-warning-2.2.0.tgz#008ec76b579820a8e5c35d81960525ca64feb626"
21382166
integrity sha512-/1WZ8+VQjR6avWOgHeEPd7SDQmFQ1B5mC1eRXsCm5TarlNmx/wCsa5GEaxGm05BORRtyG/Ex/3xq3TuRvq57qg==
21392167

2140-
process@^0.11.10:
2168+
process@^0.11.1, process@^0.11.10:
21412169
version "0.11.10"
21422170
resolved "https://registry.yarnpkg.com/process/-/process-0.11.10.tgz#7332300e840161bda3e69a1d1d91a7d4bc16f182"
21432171
integrity sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==
@@ -2801,6 +2829,13 @@ util-deprecate@^1.0.1, util-deprecate@~1.0.1:
28012829
resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz"
28022830
integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
28032831

2832+
util@^0.10.3:
2833+
version "0.10.4"
2834+
resolved "https://registry.yarnpkg.com/util/-/util-0.10.4.tgz#3aa0125bfe668a4672de58857d3ace27ecb76901"
2835+
integrity sha512-0Pm9hTQ3se5ll1XihRic3FDIku70C+iHUdT/W926rSgHV5QgXsYbKZN8MSC3tJtSkhuROzvsQjAaFENRXr+19A==
2836+
dependencies:
2837+
inherits "2.0.3"
2838+
28042839
uuid@^9.0.0:
28052840
version "9.0.0"
28062841
resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.0.tgz#592f550650024a38ceb0c562f2f6aa435761efb5"

0 commit comments

Comments
 (0)