 import logging
-import time
-from os import getenv
+from os import getenv, makedirs
 
-import azure.cognitiveservices.speech as speechsdk
 import streamlit as st
-from azure.cognitiveservices.speech.speech import SpeechRecognitionEventArgs
+from audiorecorder import audiorecorder
 from dotenv import load_dotenv
+from openai import AzureOpenAI
+from openai.types.audio import Transcription
 
-load_dotenv("azure_ai_speech.env")
+load_dotenv("frontend_transcription.env")
 logger = logging.getLogger(__name__)
-done = False
 
 
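The env file rename above implies the Azure OpenAI settings now live in frontend_transcription.env. For reference, a minimal sketch of that file, covering exactly the four variables the new code reads; every value here is a placeholder, not part of this change:

# frontend_transcription.env -- placeholder values, fill in your own
AZURE_OPENAI_API_KEY=<your-api-key>
AZURE_OPENAI_API_VERSION=<api-version, e.g. 2024-02-01>
AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
AZURE_OPENAI_WHISPER_MODEL=<your-whisper-deployment-name>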
-def transcript(
-    subscription: str,
-    region: str,
-    speech_recognition_language: str,
-):
-    speech_recognizer = speechsdk.SpeechRecognizer(
-        speech_config=speechsdk.SpeechConfig(
-            subscription=subscription,
-            region=region,
-            speech_recognition_language=speech_recognition_language,
-        ),
-        audio_config=speechsdk.audio.AudioConfig(
-            use_default_microphone=True,
-        ),
+# TODO: call backend API instead of using Azure OpenAI
+def get_transcription(file_path: str) -> Transcription:
+    client = AzureOpenAI(
+        api_key=getenv("AZURE_OPENAI_API_KEY"),
+        api_version=getenv("AZURE_OPENAI_API_VERSION"),
+        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
     )
 
-    def stop_cb(evt: SpeechRecognitionEventArgs):
-        logger.debug(f"CLOSING on {evt}")
-        speech_recognizer.stop_continuous_recognition()
-
-    def recognized_cb(evt: SpeechRecognitionEventArgs):
-        logger.debug(f"RECOGNIZED: {evt}")
-        new_text = evt.result.text.strip()
-        logger.info(new_text)
-        # FIXME: App does not show the transcription
-
-    speech_recognizer.recognizing.connect(lambda evt: logger.debug(f"RECOGNIZING: {evt}"))
-    speech_recognizer.recognized.connect(recognized_cb)
-    speech_recognizer.session_started.connect(lambda evt: logger.debug(f"SESSION STARTED: {evt}"))
-    speech_recognizer.session_stopped.connect(lambda evt: logger.debug(f"SESSION STOPPED {evt}"))
-    speech_recognizer.canceled.connect(lambda evt: logger.debug(f"CANCELED {evt}"))
-    speech_recognizer.session_stopped.connect(stop_cb)
-    speech_recognizer.canceled.connect(stop_cb)
-
-    speech_recognizer.start_continuous_recognition()
-
-    global done
-
-    if st.button("Stop transcription", key="stop_transcription"):
-        # FIXME: App does not stop transcription
-        logger.info("Stop transcription")
-        speech_recognizer.stop_continuous_recognition()
-        done = True
-
-    while done is False:
-        time.sleep(0.5)
+    # Close the file handle deterministically instead of leaking it
+    with open(file_path, mode="rb") as audio_file:
+        return client.audio.transcriptions.create(
+            file=audio_file,
+            model=getenv("AZURE_OPENAI_WHISPER_MODEL"),
+        )
 
 
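The TODO above flags that the frontend should eventually call the backend API instead of holding Azure OpenAI credentials itself; the start function below still receives a backend_url parameter that nothing uses yet. One possible sketch of that follow-up, assuming a hypothetical POST /transcriptions route on the backend that accepts a WAV upload and returns JSON with a text field (the route and response shape are illustrative only, not defined by this change):

import requests


def get_transcription_from_backend(backend_url: str, file_path: str) -> str:
    # Hypothetical endpoint; the actual backend route is not part of this change
    with open(file_path, mode="rb") as audio_file:
        response = requests.post(
            f"{backend_url}/transcriptions",
            files={"file": ("audio.wav", audio_file, "audio/wav")},
            timeout=60,
        )
    response.raise_for_status()
    # Assumes the backend responds with JSON like {"text": "..."}
    return response.json()["text"]

This shape would also keep the AZURE_OPENAI_* secrets on the backend instead of shipping them to the frontend.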
 def start(
     backend_url: str,
     log_level: int,
 ):
-    global done
-
+    # Logger
     logger.setLevel(log_level)
     logger.debug(f"set log level to {log_level}")
 
     st.write("Transcription")
 
-    if st.button("Start transcription", key="start_transcription"):
-        logger.info("Start transcription...")
-        done = False
-        try:
-            with st.spinner("Transcribing..."):
-                transcript(
-                    subscription=getenv("AZURE_AI_SPEECH_SUBSCRIPTION_KEY"),
-                    region=getenv("AZURE_AI_SPEECH_REGION"),
-                    speech_recognition_language=getenv("AZURE_AI_SPEECH_RECOGNITION_LANGUAGE"),
-                )
-        except Exception as e:
-            st.write(f"Error: {e}")
-            logger.error(f"Error: {e}")
+    # Create the artifacts directory if it does not exist
+    # TODO: remove hard-coded path
+    makedirs("artifacts", exist_ok=True)
+
+    # Audio settings
+    audio_file_path = "artifacts/audio.wav"
+
+    audio = audiorecorder(
+        start_prompt="",
+        stop_prompt="",
+        pause_prompt="",
+        show_visualizer=True,
+        key=None,
+    )
+
+    if len(audio) > 0:
+        # Play the recording back in the frontend
+        st.audio(audio.export().read())
+        # Save the recording to a file with the pydub export method
+        audio.export(audio_file_path, format="wav")
+        # Audio properties come from the underlying pydub AudioSegment
+        st.write(
+            f"Frame rate: {audio.frame_rate}, Frame width: {audio.frame_width}, Duration: {audio.duration_seconds} seconds"  # noqa
+        )
+        with st.spinner("Transcribing..."):
+            # Transcribe the saved file
+            transcription = get_transcription(audio_file_path)
+            st.write(f"Transcription: {transcription.text}")