
Commit a095fa9

wip: add ai speech sample

1 parent c2adbf6 commit a095fa9

7 files changed: +408 additions, -6 deletions

.env.template: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
 # Azure AI Speech
 AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
 AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
+AZURE_AI_SPEECH_API_REGION="eastus"

 # Bing search resource
 BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
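For context, the new variable is read at runtime by the speech_to_text.py script added in this commit; a minimal sketch of that lookup, assuming a .env file copied from this template:

```python
import os

from dotenv import load_dotenv

load_dotenv()  # parse .env and populate the process environment

# Used as the default for the script's --region flag
region = os.getenv("AZURE_AI_SPEECH_API_REGION")  # e.g. "eastus"
subscription_key = os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY")
```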

.gitignore: 2 additions & 0 deletions

@@ -167,3 +167,5 @@ generated/
 *.jpg
 *.jpeg
 .chroma
+.stop
+.transcribed.txt
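Both new entries are runtime artifacts of the app added in this commit: `.stop` is the sentinel file the Streamlit app touches to end a transcription session, and `.transcribed.txt` is the transcript file the speech-to-text script appends to.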
apps/14_streamlit_azure_ai_speech/README.md (new file): 16 additions & 0 deletions

# Streamlit Azure AI Speech

```shell
# Speech to Text script
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help

# Streamlit app
poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py
```

# References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python)
- [Real-time meeting minutes with AzureSpeechService (Japanese)](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
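Both entry points call `load_dotenv()`, so copy `.env.template` to `.env` and fill in the Azure AI Speech values (and the Azure OpenAI values, used by the task runner in main.py) before running the commands above.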
apps/14_streamlit_azure_ai_speech/main.py (new file): 210 additions & 0 deletions

import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
    st.session_state["transcribed_result"] = ""

with st.sidebar:
    inference_type = st.selectbox(
        label="INFERENCE_TYPE",
        options=[
            "azure",
            "local",
        ],
        key="INFERENCE_TYPE",
    )
    azure_ai_speech_api_language = st.selectbox(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        options=[
            "en-US",
            "ja-JP",
        ],
        key="AZURE_AI_SPEECH_API_LANGUAGE",
    )
    if inference_type == "local":
        path_to_model = st.text_input(
            label="PATH_TO_MODEL",
            value="./model",
            key="PATH_TO_MODEL",
            type="default",
        )
        stt_host = st.text_input(
            label="STT_HOST",
            value="ws://localhost:5000",
            key="STT_HOST",
            type="default",
        )
        st.warning("yet to be implemented")
    if inference_type == "azure":
        azure_openai_endpoint = st.text_input(
            label="AZURE_OPENAI_ENDPOINT",
            value=getenv("AZURE_OPENAI_ENDPOINT"),
            key="AZURE_OPENAI_ENDPOINT",
            type="default",
        )
        azure_openai_api_key = st.text_input(
            label="AZURE_OPENAI_API_KEY",
            value=getenv("AZURE_OPENAI_API_KEY"),
            key="AZURE_OPENAI_API_KEY",
            type="password",
        )
        azure_openai_api_version = st.text_input(
            label="AZURE_OPENAI_API_VERSION",
            value=getenv("AZURE_OPENAI_API_VERSION"),
            key="AZURE_OPENAI_API_VERSION",
            type="default",
        )
        azure_openai_gpt_model = st.text_input(
            label="AZURE_OPENAI_GPT_MODEL",
            value=getenv("AZURE_OPENAI_GPT_MODEL"),
            key="AZURE_OPENAI_GPT_MODEL",
            type="default",
        )
        azure_ai_speech_api_subscription_key = st.text_input(
            label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
            key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            type="password",
        )
        azure_ai_speech_api_region = st.text_input(
            label="AZURE_AI_SPEECH_API_REGION",
            value=getenv("AZURE_AI_SPEECH_API_REGION"),
            key="AZURE_AI_SPEECH_API_REGION",
            type="default",
        )
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    if inference_type == "local":
        return path_to_model and stt_host
    if inference_type == "azure":
        return azure_openai_api_key and azure_openai_endpoint and azure_openai_api_version and azure_openai_gpt_model


st.title("transcribe text")

if not is_configured():
    st.warning("Please fill in the required fields at the sidebar.")

st.info("This is a sample to transcribe text.")

# ---
# 2-column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# horizontal rule
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    selected_task = st.selectbox(
        "Task",
        [
            "Create summaries from the following text",
            "Extract 3 main points from the following text",
            # Add more tasks here
        ],
        key="selected_task",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    # Keep the Popen handle in session state: Streamlit reruns this script on
    # every interaction, so a module-level global would not survive a rerun.
    if inference_type == "local":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --endpoint {stt_host} --language {azure_ai_speech_api_language} --type local --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --type azure --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)


def run_task(selected_task: str, input: str) -> str:
    if inference_type == "local":
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        client = AzureOpenAI(
            api_key=azure_openai_api_key,
            api_version=azure_openai_api_version,
            azure_endpoint=azure_openai_endpoint,
        )
        response = client.chat.completions.create(
            model=azure_openai_gpt_model,
            messages=[
                {
                    "role": "system",
                    "content": f"""
                    Task: {selected_task}.
                    ---
                    {input}
                    ---
                    """,
                },
            ],
        )
        return response.choices[0].message.content
    raise ValueError(f"Inference type is not supported: {inference_type}")


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})")
        start_recognition()
    else:
        transcription_status.warning("Transcription is already running.")

if stop_transcribe_button:
    pathlib.Path(".stop").touch()  # signal the child process to stop
    st.session_state["process"] = None  # allow a new session to start
    output = load_transcribed_text()
    st.session_state.transcribed_result = output
    st.rerun()

if run_task_button:
    with st.spinner("Running..."):
        output = run_task(
            selected_task=selected_task,
            input=input,
        )
    st.write(output)
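The two processes coordinate through the filesystem rather than an IPC channel: main.py spawns speech_to_text.py as a subprocess, the script appends recognized utterances to .transcribed.txt, and a .stop sentinel file signals shutdown. A minimal sketch of that handshake, with names taken from this commit (the helper functions are illustrative, not part of the app):

```python
import pathlib
import time

STOP_SENTINEL = pathlib.Path(".stop")          # touched by the app's "stop" button
TRANSCRIPT = pathlib.Path(".transcribed.txt")  # appended to by speech_to_text.py


def writer_poll_for_stop(interval: float = 0.5) -> None:
    """Child-process side: block until the sentinel appears, then consume it."""
    while not STOP_SENTINEL.exists():
        time.sleep(interval)
    STOP_SENTINEL.unlink()  # remove it so the next session starts clean


def app_request_stop_and_read() -> str:
    """App side: create the sentinel, then read whatever has been transcribed."""
    STOP_SENTINEL.touch()
    return TRANSCRIPT.read_text()
```

One design note: the app reads the transcript immediately after touching the sentinel, so the last utterance may not have been flushed yet; re-reading the file after the child process exits would close that gap.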
apps/14_streamlit_azure_ai_speech/speech_to_text.py (new file): 157 additions & 0 deletions

import argparse
import logging
import os
import time

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

logger = logging.getLogger(__name__)


outfilename = "output.txt"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="speech_to_text",
        description="Azure AI Speech API Speech-to-Text",
    )
    parser.add_argument(
        "-t",
        "--type",
        default="azure",
        help="Inference type, either 'local' or 'azure'",
    )
    parser.add_argument(
        "-e",
        "--endpoint",
        default="ws://localhost:5000",
        help="Host address for local inference",
    )
    parser.add_argument(
        "-s",
        "--subscription",
        default=os.getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
        help="Azure AI Speech API subscription key",
    )
    parser.add_argument(
        "-r",
        "--region",
        default=os.getenv("AZURE_AI_SPEECH_API_REGION"),
        help="Azure AI Speech API region",
    )
    parser.add_argument(
        "-l",
        "--language",
        default="en-US",
        help="Language code for speech recognition",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="output.txt",
        help="Output file path",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Set verbose mode",
    )
    return parser.parse_args()


def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    logger.info("Canceled event")


def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStopped event")


def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    logger.info("TRANSCRIBED:")
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        logger.info(f"\tText={evt.result.text}")
        logger.info(f"\tSpeaker ID={evt.result.speaker_id}")
        if evt.result.text != "":
            with open(outfilename, "a") as f:
                f.write(f"{evt.result.text}\n")
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        logger.info(f"\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}")


def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    logger.info("SessionStarted event")


def start_transcription(args: argparse.Namespace):
    # FIXME: This is a workaround for setting the output file path
    global outfilename
    outfilename = args.output

    speech_config = None
    if args.type == "local":
        speech_config = speechsdk.SpeechConfig(
            host=args.endpoint,
            speech_recognition_language=args.language,
        )
    if args.type == "azure":
        speech_config = speechsdk.SpeechConfig(
            subscription=args.subscription,
            region=args.region,
            speech_recognition_language=args.language,
        )
    if not speech_config:
        raise ValueError(f"Invalid inference type: {args.type}")

    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
    )

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop continuous transcription upon receiving an event `evt`"""
        logger.info(f"CLOSING on {evt}")
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session-stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    conversation_transcriber.start_transcribing_async().get()

    # Wait for completion: stop on a ".stop" sentinel file or a stop event
    while not transcribing_stop:
        if os.path.exists(".stop"):
            logger.info("Stopping transcription...")
            conversation_transcriber.stop_transcribing_async().get()
            os.remove(".stop")
            break
        time.sleep(0.5)

    conversation_transcriber.stop_transcribing_async()


if __name__ == "__main__":
    # Parse .env before reading CLI defaults: init_args() evaluates
    # os.getenv() for --subscription and --region at call time
    load_dotenv()

    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    try:
        start_transcription(args=args)
    except Exception as err:
        logger.error(f"Encountered exception. {err}")
