Add comprehensive audio input/output with Azure OpenAI Whisper integration

Copilot · ks6088ts · Copilot · commit 7a7bc18aa82b · 2025-09-27T08:12:02.000Z
Co-authored-by: ks6088ts <1254960+ks6088ts@users.noreply.github.com>
diff --git a/template_langgraph/services/streamlits/pages/chat_with_tools_agent.py b/template_langgraph/services/streamlits/pages/chat_with_tools_agent.py
@@ -1,12 +1,15 @@
 from base64 import b64encode
 import tempfile
+from os import getenv
 
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
 from gtts import gTTS
 from langchain_community.callbacks.streamlit import (
     StreamlitCallbackHandler,
 )
+from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser
+from langchain_core.documents.base import Blob
 
 from template_langgraph.agents.chat_with_tools_agent.agent import (
     AgentState,
@@ -38,6 +41,33 @@ def image_to_base64(image_bytes: bytes) -> str:
     )
     st.session_state["input_output_mode"] = input_output_mode
     
+    # 音声モードの場合、Azure OpenAI設定を表示
+    if input_output_mode == "音声":
+        st.subheader("音声認識設定 (オプション)")
+        with st.expander("Azure OpenAI Whisper設定", expanded=False):
+            azure_openai_endpoint = st.text_input(
+                "AZURE_OPENAI_ENDPOINT",
+                value=getenv("AZURE_OPENAI_ENDPOINT", ""),
+                help="Azure OpenAI リソースのエンドポイント"
+            )
+            azure_openai_api_key = st.text_input(
+                "AZURE_OPENAI_API_KEY",
+                value=getenv("AZURE_OPENAI_API_KEY", ""),
+                type="password",
+                help="Azure OpenAI リソースのAPIキー"
+            )
+            azure_openai_api_version = st.text_input(
+                "AZURE_OPENAI_API_VERSION", 
+                value=getenv("AZURE_OPENAI_API_VERSION", "2024-02-01"),
+                help="Azure OpenAI APIバージョン"
+            )
+            azure_openai_model_stt = st.text_input(
+                "AZURE_OPENAI_MODEL_STT",
+                value=getenv("AZURE_OPENAI_MODEL_STT", "whisper"),
+                help="音声認識用のデプロイ名"
+            )
+            st.caption("※設定しない場合は、音声入力時にプレースホルダーテキストが使用されます")
+    
     st.divider()
     st.subheader("使用するツール")
 
@@ -99,19 +129,51 @@ def image_to_base64(image_bytes: bytes) -> str:
     
     if audio_bytes:
         st.audio(audio_bytes, format="audio/wav")
+        
         # 音声データを一時ファイルに保存
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
             temp_audio_file.write(audio_bytes)
             temp_audio_file_path = temp_audio_file.name
         
-        # TODO: 音声からテキストへの変換実装
-        # 現在は音声入力をプレースホルダーテキストに変換
-        prompt_text = "音声入力を受信しました（音声認識は後で実装予定）"
-        prompt = prompt_text
-        
-        # 一時ファイルを削除
-        import os
-        os.unlink(temp_audio_file_path)
+        # Azure OpenAI Whisperが設定されている場合は音声認識を実施
+        try:
+            if (input_output_mode == "音声" and 
+                azure_openai_endpoint and azure_openai_api_key and 
+                azure_openai_model_stt):
+                
+                with st.spinner("音声を認識中..."):
+                    audio_blob = Blob(path=temp_audio_file_path)
+                    parser = AzureOpenAIWhisperParser(
+                        api_key=azure_openai_api_key,
+                        azure_endpoint=azure_openai_endpoint,
+                        api_version=azure_openai_api_version,
+                        deployment_name=azure_openai_model_stt,
+                    )
+                    documents = parser.lazy_parse(blob=audio_blob)
+                    results = [doc.page_content for doc in documents]
+                    prompt_text = "\n".join(results).strip()
+                    
+                    if prompt_text:
+                        st.success(f"音声認識完了: {prompt_text}")
+                        prompt = prompt_text
+                    else:
+                        st.warning("音声が認識できませんでした")
+                        prompt = None
+            else:
+                # Azure OpenAI設定がない場合はプレースホルダー
+                prompt_text = "音声入力を受信しました（音声認識設定が必要です）"
+                prompt = prompt_text
+                st.info("音声認識を使用するには、サイドバーでAzure OpenAI設定を入力してください")
+                
+        except Exception as e:
+            st.error(f"音声認識でエラーが発生しました: {e}")
+            prompt_text = "音声入力でエラーが発生しました"
+            prompt = prompt_text
+        finally:
+            # 一時ファイルを削除
+            import os
+            if os.path.exists(temp_audio_file_path):
+                os.unlink(temp_audio_file_path)
         
 else:
     # 既存のテキスト入力モード