
Commit a095fa9

wip: add ai speech sample

1 parent c2adbf6 commit a095fa9

7 files changed: +408 additions, -6 deletions

.env.template: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
 # Azure AI Speech
 AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
 AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
+AZURE_AI_SPEECH_API_REGION="eastus"

 # Bing search resource
 BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
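For context, the new variable is read at runtime by the speech_to_text.py script added in this commit; a minimal sketch of that lookup, assuming a .env file copied from this template:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # parse .env and populate the process environment

# Used as the default for the script's --region flag
region = os.getenv("AZURE_AI_SPEECH_API_REGION")  # e.g. "eastus"
subscription_key = os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY")
```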

.gitignore: 2 additions & 0 deletions

@@ -167,3 +167,5 @@ generated/
 *.jpg
 *.jpeg
 .chroma
+.stop
+.transcribed.txt
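Both new entries are runtime artifacts of the app added in this commit: `.stop` is the sentinel file the Streamlit app touches to end a transcription session, and `.transcribed.txt` is the transcript file the speech-to-text script appends to.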
apps/14_streamlit_azure_ai_speech/README.md (new file): 16 additions & 0 deletions

# Streamlit Azure AI Speech

```shell
# Speech to Text script
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help

# Streamlit app
poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py
```

# References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python)
- [Real-time meeting minutes with AzureSpeechService (Japanese)](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
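Both entry points call `load_dotenv()`, so copy `.env.template` to `.env` and fill in the Azure AI Speech values (and the Azure OpenAI values, used by the task runner in main.py) before running the commands above.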
apps/14_streamlit_azure_ai_speech/main.py (new file): 210 additions & 0 deletions

import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
    st.session_state["transcribed_result"] = ""

with st.sidebar:
    inference_type = st.selectbox(
        label="INFERENCE_TYPE",
        options=[
            "azure",
            "local",
        ],
        key="INFERENCE_TYPE",
    )
    azure_ai_speech_api_language = st.selectbox(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        options=[
            "en-US",
            "ja-JP",
        ],
        key="AZURE_AI_SPEECH_API_LANGUAGE",
    )
    if inference_type == "local":
        path_to_model = st.text_input(
            label="PATH_TO_MODEL",
            value="./model",
            key="PATH_TO_MODEL",
            type="default",
        )
        stt_host = st.text_input(
            label="STT_HOST",
            value="ws://localhost:5000",
            key="STT_HOST",
            type="default",
        )
        st.warning("yet to be implemented")
    if inference_type == "azure":
        azure_openai_endpoint = st.text_input(
            label="AZURE_OPENAI_ENDPOINT",
            value=getenv("AZURE_OPENAI_ENDPOINT"),
            key="AZURE_OPENAI_ENDPOINT",
            type="default",
        )
        azure_openai_api_key = st.text_input(
            label="AZURE_OPENAI_API_KEY",
            value=getenv("AZURE_OPENAI_API_KEY"),
            key="AZURE_OPENAI_API_KEY",
            type="password",
        )
        azure_openai_api_version = st.text_input(
            label="AZURE_OPENAI_API_VERSION",
            value=getenv("AZURE_OPENAI_API_VERSION"),
            key="AZURE_OPENAI_API_VERSION",
            type="default",
        )
        azure_openai_gpt_model = st.text_input(
            label="AZURE_OPENAI_GPT_MODEL",
            value=getenv("AZURE_OPENAI_GPT_MODEL"),
            key="AZURE_OPENAI_GPT_MODEL",
            type="default",
        )
        azure_ai_speech_api_subscription_key = st.text_input(
            label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
            key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            type="password",
        )
        azure_ai_speech_api_region = st.text_input(
            label="AZURE_AI_SPEECH_API_REGION",
            value=getenv("AZURE_AI_SPEECH_API_REGION"),
            key="AZURE_AI_SPEECH_API_REGION",
            type="default",
        )
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    if inference_type == "local":
        return path_to_model and stt_host
    if inference_type == "azure":
        return azure_openai_api_key and azure_openai_endpoint and azure_openai_api_version and azure_openai_gpt_model


st.title("transcribe text")

if not is_configured():
    st.warning("Please fill in the required fields at the sidebar.")

st.info("This is a sample to transcribe text.")

# ---
# 2-column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# horizontal rule
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    selected_task = st.selectbox(
        "Task",
        [
            "Create summaries from the following text",
            "Extract 3 main points from the following text",
            # Add more tasks here
        ],
        key="selected_task",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    # Keep the Popen handle in session state: Streamlit reruns this script on
    # every interaction, so a module-level global would not survive a rerun.
    if inference_type == "local":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --endpoint {stt_host} --language {azure_ai_speech_api_language} --type local --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --type azure --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)


def run_task(selected_task: str, input: str) -> str:
    if inference_type == "local":
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        client = AzureOpenAI(
            api_key=azure_openai_api_key,
            api_version=azure_openai_api_version,
            azure_endpoint=azure_openai_endpoint,
        )
        response = client.chat.completions.create(
            model=azure_openai_gpt_model,
            messages=[
                {
                    "role": "system",
                    "content": f"""
                    Task: {selected_task}.
                    ---
                    {input}
                    ---
                    """,
                },
            ],
        )
        return response.choices[0].message.content
    raise ValueError(f"Inference type is not supported: {inference_type}")


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})")
        start_recognition()
    else:
        transcription_status.warning("Transcription is already running.")

if stop_transcribe_button:
    pathlib.Path(".stop").touch()  # signal the child process to stop
    st.session_state["process"] = None  # allow a new session to start
    output = load_transcribed_text()
    st.session_state.transcribed_result = output
    st.rerun()

if run_task_button:
    with st.spinner("Running..."):
        output = run_task(
            selected_task=selected_task,
            input=input,
        )
    st.write(output)
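The two processes coordinate through the filesystem rather than an IPC channel: main.py spawns speech_to_text.py as a subprocess, the script appends recognized utterances to .transcribed.txt, and a .stop sentinel file signals shutdown. A minimal sketch of that handshake, with names taken from this commit (the helper functions are illustrative, not part of the app):

```python
import pathlib
import time

STOP_SENTINEL = pathlib.Path(".stop")          # touched by the app's "stop" button
TRANSCRIPT = pathlib.Path(".transcribed.txt")  # appended to by speech_to_text.py


def writer_poll_for_stop(interval: float = 0.5) -> None:
    """Child-process side: block until the sentinel appears, then consume it."""
    while not STOP_SENTINEL.exists():
        time.sleep(interval)
    STOP_SENTINEL.unlink()  # remove it so the next session starts clean


def app_request_stop_and_read() -> str:
    """App side: create the sentinel, then read whatever has been transcribed."""
    STOP_SENTINEL.touch()
    return TRANSCRIPT.read_text()
```

One design note: the app reads the transcript immediately after touching the sentinel, so the last utterance may not have been flushed yet; re-reading the file after the child process exits would close that gap.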
apps/14_streamlit_azure_ai_speech/speech_to_text.py (new file): 157 additions & 0 deletions

import argparse
import logging
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

logger = logging.getLogger(__name__)


outfilename = "output.txt"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="speech_to_text",
        description="Azure AI Speech API Speech-to-Text",
    )
    parser.add_argument(
        "-t",
        "--type",
        default="azure",
        help="Inference type, either 'local' or 'azure'",
    )
    parser.add_argument(
        "-e",
        "--endpoint",
        default="ws://localhost:5000",
        help="Host address for local inference",
    )
    parser.add_argument(
        "-s",
        "--subscription",
        default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        help="Azure AI Speech API subscription key",
    )
    parser.add_argument(
        "-r",
        "--region",
        default=os.getenv("AZURE_AI_SPEECH_API_REGION"),
        help="Azure AI Speech API region",
    )
    parser.add_argument(
        "-l",
        "--language",
        default="en-US",
        help="Language code for speech recognition",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="output.txt",
        help="Output file path",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Set verbose mode",
    )
    return parser.parse_args()


def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    logger.info("Canceled event")


def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStopped event")


def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    logger.info("TRANSCRIBED:")
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        logger.info(f"\tText={evt.result.text}")
        logger.info(f"\tSpeaker ID={evt.result.speaker_id}")
        if evt.result.text != "":
            with open(outfilename, "a") as f:
                f.write(f"{evt.result.text}\n")
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}")


def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStarted event")


def start_transcription(args: argparse.Namespace):
    # FIXME: This is a workaround for setting the output file path
    global outfilename
    outfilename = args.output

    speech_config = None
    if args.type == "local":
        speech_config = speechsdk.SpeechConfig(
            host=args.endpoint,
            speech_recognition_language=args.language,
        )
    if args.type == "azure":
        speech_config = speechsdk.SpeechConfig(
            subscription=args.subscription,
            region=args.region,
            speech_recognition_language=args.language,
        )
    if not speech_config:
        raise ValueError(f"Invalid inference type: {args.type}")

    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
    )

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop continuous transcription upon receiving an event `evt`"""
        logger.info(f"CLOSING on {evt}")
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session-stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async().get()

    # Wait for completion: stop on a ".stop" sentinel file or a stop event
    while not transcribing_stop:
        if os.path.exists(".stop"):
            logger.info("Stopping transcription...")
            conversation_transcriber.stop_transcribing_async().get()
            os.remove(".stop")
            break
        time.sleep(0.5)

    conversation_transcriber.stop_transcribing_async()


if __name__ == "__main__":
    # Parse .env before reading CLI defaults: init_args() evaluates
    # os.getenv() for --subscription and --region at call time
    load_dotenv()

    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    try:
        start_transcription(args=args)
    except Exception as err:
        logger.error(f"Encountered exception. {err}")
