Skip to content

Commit 5d62fab

Browse files
authored
Merge pull request #61 from ks6088ts-labs/feature/issue-60_transcription
use whisper model, instead of AI Speech
2 parents 55ff823 + 38c8f87 commit 5d62fab

File tree

7 files changed

+80
-86
lines changed

azure_ai_speech.env.sample

Lines changed: 0 additions & 3 deletions
This file was deleted.

compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,6 @@ services:
2525
ports:
2626
- 8501:8501
2727
volumes:
28-
- ./azure_ai_speech.env:/app/azure_ai_speech.env
28+
- ./frontend_transcription.env:/app/frontend_transcription.env
2929
environment:
3030
- PYTHONUNBUFFERED=1

docs/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
- [Streamlit](https://docs.streamlit.io/get-started/installation/command-line)
2727
- [Streamlit API cheat sheet](https://docs.streamlit.io/develop/quick-reference/cheat-sheet)
2828
- [Streamlit > Display progress and status](https://docs.streamlit.io/develop/api-reference/status)
29+
- [streamlit-audiorecorder](https://github.com/theevann/streamlit-audiorecorder)
30+
- [Build a basic LLM chat app](https://docs.streamlit.io/develop/tutorials/llms/build-conversational-apps)
2931
- [aiohttp > Installing all speedups in one command](https://docs.aiohttp.org/en/stable/#installing-all-speedups-in-one-command)
3032
- [Python & aiohttp: How to upload files to a remote server](https://www.slingacademy.com/article/python-aiohttp-how-to-upload-files-to-a-remote-server/)
3133

Lines changed: 44 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,65 @@
11
import logging
2-
import time
3-
from os import getenv
2+
from os import getenv, makedirs
43

5-
import azure.cognitiveservices.speech as speechsdk
64
import streamlit as st
7-
from azure.cognitiveservices.speech.speech import SpeechRecognitionEventArgs
5+
from audiorecorder import audiorecorder
86
from dotenv import load_dotenv
7+
from openai import AzureOpenAI
8+
from openai.types.audio import Transcription
99

10-
load_dotenv("azure_ai_speech.env")
10+
load_dotenv("frontend_transcription.env")
1111
logger = logging.getLogger(__name__)
12-
done = False
1312

1413

15-
def transcript(
16-
subscription: str,
17-
region: str,
18-
speech_recognition_language: str,
19-
):
20-
speech_recognizer = speechsdk.SpeechRecognizer(
21-
speech_config=speechsdk.SpeechConfig(
22-
subscription=subscription,
23-
region=region,
24-
speech_recognition_language=speech_recognition_language,
25-
),
26-
audio_config=speechsdk.audio.AudioConfig(
27-
use_default_microphone=True,
28-
),
# TODO: call backend API instead of using Azure OpenAI
def get_transcription(file_path: str) -> Transcription:
    """Transcribe the audio file at *file_path* with the Azure OpenAI Whisper model.

    Connection settings are read from the environment (loaded from
    frontend_transcription.env at import time): AZURE_OPENAI_API_KEY,
    AZURE_OPENAI_API_VERSION, AZURE_OPENAI_ENDPOINT and
    AZURE_OPENAI_WHISPER_MODEL.

    Args:
        file_path: Path to the audio file to transcribe (a WAV recording
            in this app).

    Returns:
        The Transcription response object; its ``.text`` attribute holds
        the transcribed text.
    """
    client = AzureOpenAI(
        api_key=getenv("AZURE_OPENAI_API_KEY"),
        api_version=getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
    )
    # Open the audio file in a context manager so the handle is closed
    # even if the API request raises (the original left it open forever).
    with open(file_path, mode="rb") as audio_file:
        return client.audio.transcriptions.create(
            file=audio_file,
            model=getenv("AZURE_OPENAI_WHISPER_MODEL"),
        )
6126

6227

def start(
    backend_url: str,
    log_level: int,
):
    """Render the transcription page: record audio in the browser, save it,
    and display its Whisper transcription.

    Args:
        backend_url: Backend endpoint (currently unused; see the TODO on
            get_transcription about routing through the backend API).
        log_level: Logging level applied to this module's logger.
    """
    # Apply the requested verbosity to this module's logger.
    logger.setLevel(log_level)
    logger.debug(f"set log level to {log_level}")

    st.write("Transcription")

    # create directory if not exists
    # TODO: remove hard coded path
    makedirs("artifacts", exist_ok=True)

    # Audio settings
    recording_path = "artifacts/audio.wav"

    recording = audiorecorder(
        start_prompt="",
        stop_prompt="",
        pause_prompt="",
        show_visualizer=True,
        key=None,
    )

    # Guard clause: nothing recorded yet, nothing to transcribe.
    if len(recording) == 0:
        return

    # Playback widget for the captured audio.
    st.audio(recording.export().read())
    # Persist the recording (pydub export) so the transcription API can read it.
    recording.export(recording_path, format="wav")
    # Surface a few pydub AudioSegment properties for the user.
    st.write(
        f"Frame rate: {recording.frame_rate}, Frame width: {recording.frame_width}, Duration: {recording.duration_seconds} seconds"  # noqa
    )
    with st.spinner("Transcribing..."):
        # Get transcription
        transcription = get_transcription(recording_path)
        st.write(f"Transcription: {transcription.text}")

frontend_transcription.env.sample

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
AZURE_OPENAI_ENDPOINT = "https://<aoai-name>.openai.azure.com"
2+
AZURE_OPENAI_API_KEY = "<aoai-api-key>"
3+
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"
4+
AZURE_OPENAI_WHISPER_MODEL = "whisper"
5+
AZURE_OPENAI_GPT_MODEL = "gpt-4o"

poetry.lock

Lines changed: 27 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,14 @@ azure-cosmos = "^4.6.1"
3939
[tool.poetry.group.frontend.dependencies]
4040
streamlit = "^1.33.0"
4141
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
42-
azure-cognitiveservices-speech = "^1.37.0"
4342
microsoft-kiota-abstractions = "^1.3.2"
4443
microsoft-kiota-authentication-azure = "^1.0.0"
4544
microsoft-kiota-http = "^1.3.1"
4645
microsoft-kiota-serialization-form = "^0.1.0"
4746
microsoft-kiota-serialization-json = "^1.2.0"
4847
microsoft-kiota-serialization-multipart = "^0.1.0"
4948
microsoft-kiota-serialization-text = "^1.0.0"
49+
streamlit-audiorecorder = "^0.0.5"
5050

5151

5252
[tool.poetry.group.azure-functions.dependencies]

0 commit comments

Comments
 (0)