
Commit ed9863a

wip: add speech to text example
1 parent c2adbf6 commit ed9863a

File tree

7 files changed: +359 −6 lines changed


.env.template

Lines changed: 1 addition & 0 deletions
```diff
@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
 # Azure AI Speech
 AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
 AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
+AZURE_AI_SPEECH_API_REGION="eastus"
 
 # Bing search resource
 BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
```

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -167,3 +167,5 @@ generated/
 *.jpg
 *.jpeg
 .chroma
+.stop
+.transcribed.txt
```
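
Both new entries are runtime artifacts of the app added below: `.stop` is the flag file the Streamlit front end touches to ask the recognizer subprocess to exit, and `.transcribed.txt` is the file the recognized text is appended to (see the handshake sketch after main.py).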
apps/14_streamlit_azure_ai_speech/README.md

Lines changed: 11 additions & 0 deletions
````markdown
# Streamlit Azure AI Speech

```shell
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help
```

# References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Real-time meeting minutes with AzureSpeechService](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
````
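
The Streamlit front end itself would presumably be launched the usual way; a sketch, with the main.py path taken from the source-code link embedded in the app below:

```shell
poetry run streamlit run apps/14_streamlit_azure_ai_speech/main.py
```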
apps/14_streamlit_azure_ai_speech/main.py

Lines changed: 188 additions & 0 deletions
```python
import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
    st.session_state["transcribed_result"] = ""

with st.sidebar:
    azure_openai_endpoint = st.text_input(
        label="AZURE_OPENAI_ENDPOINT",
        value=getenv("AZURE_OPENAI_ENDPOINT"),
        key="AZURE_OPENAI_ENDPOINT",
        type="default",
    )
    azure_openai_api_key = st.text_input(
        label="AZURE_OPENAI_API_KEY",
        value=getenv("AZURE_OPENAI_API_KEY"),
        key="AZURE_OPENAI_API_KEY",
        type="password",
    )
    azure_openai_api_version = st.text_input(
        label="AZURE_OPENAI_API_VERSION",
        value=getenv("AZURE_OPENAI_API_VERSION"),
        key="AZURE_OPENAI_API_VERSION",
        type="default",
    )
    azure_openai_gpt_model = st.text_input(
        label="AZURE_OPENAI_GPT_MODEL",
        value=getenv("AZURE_OPENAI_GPT_MODEL"),
        key="AZURE_OPENAI_GPT_MODEL",
        type="default",
    )
    azure_ai_speech_api_subscription_key = st.text_input(
        label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
        value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
        type="password",
    )
    azure_ai_speech_api_region = st.text_input(
        label="AZURE_AI_SPEECH_API_REGION",
        value=getenv("AZURE_AI_SPEECH_API_REGION"),
        key="AZURE_AI_SPEECH_API_REGION",
        type="default",
    )
    azure_ai_speech_api_language = st.text_input(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        value="en-US",
        key="AZURE_AI_SPEECH_API_LANGUAGE",
        type="default",
    )
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    return azure_openai_api_key and azure_openai_endpoint and azure_openai_api_version and azure_openai_gpt_model


st.title("transcribe text")

if not is_configured():
    st.warning("Please fill in the required fields at the sidebar.")

st.info("This is a sample to transcribe text.")

supported_tasks = [
    "Create summaries",
    "Do something",
]

# ---
# 2-column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# horizontal rule
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    target = st.selectbox(
        "Task",
        supported_tasks,
        key="target",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    # Keep the child process handle in session state: Streamlit reruns this
    # script on every interaction, so a module-level global would not survive.
    command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --verbose"  # noqa
    st.session_state["process"] = subprocess.Popen(command, shell=True)


def stop_recognition():
    process = st.session_state.get("process")
    if process:
        # Touch the .stop flag file; speech_to_text.py polls for it and exits.
        pathlib.Path(".stop").touch()
        process.wait()
        st.session_state["process"] = None


def run_task(target: str, input: str) -> str:
    client = AzureOpenAI(
        api_key=azure_openai_api_key,
        api_version=azure_openai_api_version,
        azure_endpoint=azure_openai_endpoint,
    )

    response = client.chat.completions.create(
        model=azure_openai_gpt_model,
        messages=[
            {
                "role": "system",
                "content": f"""
You are a professional assistant. Please perform the following task on the text below: {target}.
---
{input}
---
""",
            },
        ],
    )
    return response.choices[0].message.content


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        start_recognition()
        transcription_status.info("Started speech recognition.")
    else:
        transcription_status.warning("Speech recognition is already running.")

if stop_transcribe_button:
    if st.session_state.get("process"):
        stop_recognition()
        st.success("Stopped speech recognition.")
        # Show the final transcript in the text area on the next run
        st.session_state.transcribed_result = load_transcribed_text()
        st.rerun()
    else:
        st.warning("Speech recognition is not running.")

if run_task_button:
    transcribed_text = load_transcribed_text()
    with st.spinner("Running..."):
        output = run_task(
            target=target,
            input=transcribed_text,
        )
    st.write(output)
```
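
For reference, the start/stop handshake between the app and the recognizer can be exercised without the UI. A minimal sketch, assuming the conventions above (the `.stop` flag and `.transcribed.txt` output) and that the speech key and region are supplied via `.env`; the argument-list form of `Popen` is used here instead of the app's `shell=True` string:

```python
import pathlib
import subprocess

# Spawn the recognizer the same way main.py does; subscription key and
# region are assumed to come from .env via load_dotenv() in the script.
proc = subprocess.Popen(
    ["python", "apps/14_streamlit_azure_ai_speech/speech_to_text.py", "--output", ".transcribed.txt"]
)

input("Speak into the microphone, then press Enter to stop...")

# Touching the .stop flag asks speech_to_text.py to finish; the script
# removes the flag itself before exiting.
pathlib.Path(".stop").touch()
proc.wait()

print(pathlib.Path(".transcribed.txt").read_text())
```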
apps/14_streamlit_azure_ai_speech/speech_to_text.py

Lines changed: 135 additions & 0 deletions
```python
import argparse
import logging
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

logger = logging.getLogger(__name__)


outfilename = "output.txt"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="speech_to_text",
        description="Azure AI Speech API Speech-to-Text",
    )
    parser.add_argument(
        "-s",
        "--subscription",
        default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        help="Azure AI Speech API subscription key",
    )
    parser.add_argument(
        "-r",
        "--region",
        default=os.getenv("AZURE_AI_SPEECH_API_REGION"),
        help="Azure AI Speech API region",
    )
    parser.add_argument(
        "-l",
        "--language",
        default="en-US",
        help="Language code for speech recognition",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="output.txt",
        help="Output file path",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Set verbose mode",
    )
    return parser.parse_args()


def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    logger.info("Canceled event")


def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStopped event")


def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    logger.info("TRANSCRIBED:")
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        logger.info(f"\tText={evt.result.text}")
        logger.info(f"\tSpeaker ID={evt.result.speaker_id}")
        # Append each recognized utterance to the output file
        with open(outfilename, "a") as f:
            f.write(f"{evt.result.text}\n")
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}")


def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStarted event")


def start_transcription(args: argparse.Namespace):
    # FIXME: This is a workaround for setting the output file path
    global outfilename
    outfilename = args.output

    speech_config = speechsdk.SpeechConfig(
        subscription=args.subscription,
        region=args.region,
        speech_recognition_language=args.language,
    )

    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
    )

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop continuous transcription upon receiving an event `evt`."""
        logger.info(f"CLOSING on {evt}")
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session-stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()

    # Wait for completion, polling for the .stop flag file
    while not transcribing_stop:
        if os.path.exists(".stop"):
            logger.info("Stop flag detected. Terminating speech recognition.")
            conversation_transcriber.stop_transcribing_async()
            os.remove(".stop")
            break
        time.sleep(0.5)

    conversation_transcriber.stop_transcribing_async()


if __name__ == "__main__":
    # Parse the .env file and set environment variables before building the
    # argument parser, so its os.getenv defaults can see the loaded values
    load_dotenv()

    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    try:
        start_transcription(args=args)
    except Exception as err:
        logger.error(f"Encountered exception. {err}")
```
