
Commit ed9863a

wip: add speech to text example
1 parent c2adbf6 commit ed9863a

File tree

7 files changed: +359 −6 lines changed


.env.template

Lines changed: 1 addition & 0 deletions
```diff
@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
 # Azure AI Speech
 AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
 AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
+AZURE_AI_SPEECH_API_REGION="eastus"
 
 # Bing search resource
 BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
```

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -167,3 +167,5 @@ generated/
 *.jpg
 *.jpeg
 .chroma
+.stop
+.transcribed.txt
```
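
Both new entries are runtime artifacts of the app added below: `.stop` is the flag file the Streamlit front end touches to ask the recognizer subprocess to exit, and `.transcribed.txt` is the file the recognized text is appended to (see the handshake sketch after main.py).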
apps/14_streamlit_azure_ai_speech/README.md

Lines changed: 11 additions & 0 deletions
````markdown
# Streamlit Azure AI Speech

```shell
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help
```

# References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Real-time meeting minutes with AzureSpeechService](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
````
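
The Streamlit front end itself would presumably be launched the usual way; a sketch, with the main.py path taken from the source-code link embedded in the app below:

```shell
poetry run streamlit run apps/14_streamlit_azure_ai_speech/main.py
```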
apps/14_streamlit_azure_ai_speech/main.py

Lines changed: 188 additions & 0 deletions
```python
import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
    st.session_state["transcribed_result"] = ""

with st.sidebar:
    azure_openai_endpoint = st.text_input(
        label="AZURE_OPENAI_ENDPOINT",
        value=getenv("AZURE_OPENAI_ENDPOINT"),
        key="AZURE_OPENAI_ENDPOINT",
        type="default",
    )
    azure_openai_api_key = st.text_input(
        label="AZURE_OPENAI_API_KEY",
        value=getenv("AZURE_OPENAI_API_KEY"),
        key="AZURE_OPENAI_API_KEY",
        type="password",
    )
    azure_openai_api_version = st.text_input(
        label="AZURE_OPENAI_API_VERSION",
        value=getenv("AZURE_OPENAI_API_VERSION"),
        key="AZURE_OPENAI_API_VERSION",
        type="default",
    )
    azure_openai_gpt_model = st.text_input(
        label="AZURE_OPENAI_GPT_MODEL",
        value=getenv("AZURE_OPENAI_GPT_MODEL"),
        key="AZURE_OPENAI_GPT_MODEL",
        type="default",
    )
    azure_ai_speech_api_subscription_key = st.text_input(
        label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
        value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
        type="password",
    )
    azure_ai_speech_api_region = st.text_input(
        label="AZURE_AI_SPEECH_API_REGION",
        value=getenv("AZURE_AI_SPEECH_API_REGION"),
        key="AZURE_AI_SPEECH_API_REGION",
        type="default",
    )
    azure_ai_speech_api_language = st.text_input(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        value="en-US",
        key="AZURE_AI_SPEECH_API_LANGUAGE",
        type="default",
    )
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    return azure_openai_api_key and azure_openai_endpoint and azure_openai_api_version and azure_openai_gpt_model


st.title("transcribe text")

if not is_configured():
    st.warning("Please fill in the required fields at the sidebar.")

st.info("This is a sample to transcribe text.")

supported_tasks = [
    "Create summaries",
    "Do something",
]

# ---
# 2-column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# horizontal rule
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    target = st.selectbox(
        "Task",
        supported_tasks,
        key="target",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    # Keep the child process handle in session state: Streamlit reruns this
    # script on every interaction, so a module-level global would not survive.
    command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --verbose"  # noqa
    st.session_state["process"] = subprocess.Popen(command, shell=True)


def stop_recognition():
    process = st.session_state.get("process")
    if process:
        # Touch the .stop flag file; speech_to_text.py polls for it and exits.
        pathlib.Path(".stop").touch()
        process.wait()
        st.session_state["process"] = None


def run_task(target: str, input: str) -> str:
    client = AzureOpenAI(
        api_key=azure_openai_api_key,
        api_version=azure_openai_api_version,
        azure_endpoint=azure_openai_endpoint,
    )

    response = client.chat.completions.create(
        model=azure_openai_gpt_model,
        messages=[
            {
                "role": "system",
                "content": f"""
You are a professional assistant. Please perform the following task on the text below: {target}.
---
{input}
---
""",
            },
        ],
    )
    return response.choices[0].message.content


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        start_recognition()
        transcription_status.info("Started speech recognition.")
    else:
        transcription_status.warning("Speech recognition is already running.")

if stop_transcribe_button:
    if st.session_state.get("process"):
        stop_recognition()
        st.success("Stopped speech recognition.")
        # Show the final transcript in the text area on the next run
        st.session_state.transcribed_result = load_transcribed_text()
        st.rerun()
    else:
        st.warning("Speech recognition is not running.")

if run_task_button:
    transcribed_text = load_transcribed_text()
    with st.spinner("Running..."):
        output = run_task(
            target=target,
            input=transcribed_text,
        )
    st.write(output)
```
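
For reference, the start/stop handshake between the app and the recognizer can be exercised without the UI. A minimal sketch, assuming the conventions above (the `.stop` flag and `.transcribed.txt` output) and that the speech key and region are supplied via `.env`; the argument-list form of `Popen` is used here instead of the app's `shell=True` string:

```python
import pathlib
import subprocess

# Spawn the recognizer the same way main.py does; subscription key and
# region are assumed to come from .env via load_dotenv() in the script.
proc = subprocess.Popen(
    ["python", "apps/14_streamlit_azure_ai_speech/speech_to_text.py", "--output", ".transcribed.txt"]
)

input("Speak into the microphone, then press Enter to stop...")

# Touching the .stop flag asks speech_to_text.py to finish; the script
# removes the flag itself before exiting.
pathlib.Path(".stop").touch()
proc.wait()

print(pathlib.Path(".transcribed.txt").read_text())
```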
apps/14_streamlit_azure_ai_speech/speech_to_text.py

Lines changed: 135 additions & 0 deletions
```python
import argparse
import logging
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

logger = logging.getLogger(__name__)


outfilename = "output.txt"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="speech_to_text",
        description="Azure AI Speech API Speech-to-Text",
    )
    parser.add_argument(
        "-s",
        "--subscription",
        default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        help="Azure AI Speech API subscription key",
    )
    parser.add_argument(
        "-r",
        "--region",
        default=os.getenv("AZURE_AI_SPEECH_API_REGION"),
        help="Azure AI Speech API region",
    )
    parser.add_argument(
        "-l",
        "--language",
        default="en-US",
        help="Language code for speech recognition",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="output.txt",
        help="Output file path",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Set verbose mode",
    )
    return parser.parse_args()


def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    logger.info("Canceled event")


def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStopped event")


def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    logger.info("TRANSCRIBED:")
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        logger.info(f"\tText={evt.result.text}")
        logger.info(f"\tSpeaker ID={evt.result.speaker_id}")
        # Append each recognized utterance to the output file
        with open(outfilename, "a") as f:
            f.write(f"{evt.result.text}\n")
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}")


def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStarted event")


def start_transcription(args: argparse.Namespace):
    # FIXME: This is a workaround for setting the output file path
    global outfilename
    outfilename = args.output

    speech_config = speechsdk.SpeechConfig(
        subscription=args.subscription,
        region=args.region,
        speech_recognition_language=args.language,
    )

    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
    )

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop continuous transcription upon receiving an event `evt`."""
        logger.info(f"CLOSING on {evt}")
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session-stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async()

    # Wait for completion, polling for the .stop flag file
    while not transcribing_stop:
        if os.path.exists(".stop"):
            logger.info("Stop flag detected. Terminating speech recognition.")
            conversation_transcriber.stop_transcribing_async()
            os.remove(".stop")
            break
        time.sleep(0.5)

    conversation_transcriber.stop_transcribing_async()


if __name__ == "__main__":
    # Parse the .env file and set environment variables before building the
    # argument parser, so its os.getenv defaults can see the loaded values
    load_dotenv()

    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    try:
        start_transcription(args=args)
    except Exception as err:
        logger.error(f"Encountered exception. {err}")
```
