@@ -1,15 +1,15 @@
-from base64 import b64encode
+import os
 import tempfile
-from os import getenv
+from base64 import b64encode
+from datetime import datetime
 
 import streamlit as st
+import whisper
 from audio_recorder_streamlit import audio_recorder
 from gtts import gTTS
 from langchain_community.callbacks.streamlit import (
     StreamlitCallbackHandler,
 )
-from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser
-from langchain_core.documents.base import Blob
 
 from template_langgraph.agents.chat_with_tools_agent.agent import (
     AgentState,
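A note on the import swap above: the new top-level `import os` replaces the inline `import os` statements that later hunks remove from the `finally` blocks, and the `whisper` import is presumably the `openai-whisper` package, not the unrelated `whisper` package on PyPI (Graphite's time-series database library). A quick sanity-check sketch, under that assumption:

```python
# Verify that the installed "whisper" is openai-whisper, which exposes
# load_model; the unrelated PyPI "whisper" package does not.
import whisper

assert hasattr(whisper, "load_model"), "expected the openai-whisper package"
```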
@@ -22,52 +22,64 @@ def image_to_base64(image_bytes: bytes) -> str:
     return b64encode(image_bytes).decode("utf-8")
 
 
+@st.cache_resource(show_spinner=False)
+def load_whisper_model(model_size: str = "base"):
+    """Load a Whisper model only once per session."""
+
+    return whisper.load_model(model_size)
+
+
 if "chat_history" not in st.session_state:
     st.session_state["chat_history"] = []
 
 # Sidebar: input/output mode selection, tool selection, and agent construction
 with st.sidebar:
     st.subheader("入出力モード")
-
+
     # Input/output mode selection
     if "input_output_mode" not in st.session_state:
         st.session_state["input_output_mode"] = "テキスト"
-
+
     input_output_mode = st.radio(
         "モードを選択してください",
         options=["テキスト", "音声"],
         index=0 if st.session_state["input_output_mode"] == "テキスト" else 1,
-        help="テキスト: 従来のテキスト入力/出力, 音声: マイク入力/音声出力"
+        help="テキスト: 従来のテキスト入力/出力, 音声: マイク入力/音声出力",
     )
     st.session_state["input_output_mode"] = input_output_mode
-
-    # In voice mode, show the Azure OpenAI settings
+
+    # In voice mode, show the Whisper settings
     if input_output_mode == "音声":
         st.subheader("音声認識設定 (オプション)")
-        with st.expander("Azure OpenAI Whisper設定", expanded=False):
-            azure_openai_endpoint = st.text_input(
-                "AZURE_OPENAI_ENDPOINT",
-                value=getenv("AZURE_OPENAI_ENDPOINT", ""),
-                help="Azure OpenAI リソースのエンドポイント"
+        with st.expander("Whisper設定", expanded=False):
+            selected_model = st.sidebar.selectbox(
+                "Whisperモデル",
+                [
+                    "tiny",
+                    "base",
+                    "small",
+                    "medium",
+                    "large",
+                ],
+                index=1,
             )
-            azure_openai_api_key = st.text_input(
-                "AZURE_OPENAI_API_KEY",
-                value=getenv("AZURE_OPENAI_API_KEY", ""),
-                type="password",
-                help="Azure OpenAI リソースのAPIキー"
+            transcription_language = st.sidebar.selectbox(
+                "文字起こし言語",
+                [
+                    "auto",
+                    "ja",
+                    "en",
+                ],
+                index=0,
+                help="autoは言語自動判定です",
             )
-            azure_openai_api_version = st.text_input(
-                "AZURE_OPENAI_API_VERSION",
-                value=getenv("AZURE_OPENAI_API_VERSION", "2024-02-01"),
-                help="Azure OpenAI APIバージョン"
+            st.markdown(
+                """
+                - Whisperモデルは大きいほど高精度ですが、処理に時間がかかります。
+                - 文字起こし言語を指定することで、認識精度が向上します。
+                """
             )
-            azure_openai_model_stt = st.text_input(
-                "AZURE_OPENAI_MODEL_STT",
-                value=getenv("AZURE_OPENAI_MODEL_STT", "whisper"),
-                help="音声認識用のデプロイ名"
-            )
-        st.caption("※設定しない場合は、音声入力時にプレースホルダーテキストが使用されます")
-
+
     st.divider()
     st.subheader("使用するツール")
 
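For context on the `@st.cache_resource` decorator introduced in this hunk: Streamlit reruns the entire script on every widget interaction, so an uncached `whisper.load_model` call would reload the model on each rerun. The cached loader returns the same model object across reruns. A minimal standalone sketch of the effect (the timing print is illustrative, not a measurement):

```python
import time

import streamlit as st
import whisper


@st.cache_resource(show_spinner=False)
def load_whisper_model(model_size: str = "base"):
    # First call loads (and, if needed, downloads) the model;
    # later calls with the same model_size return the cached object.
    return whisper.load_model(model_size)


start = time.perf_counter()
model = load_whisper_model("base")
st.write(f"Model ready in {time.perf_counter() - start:.2f}s")
```

One caveat worth flagging in review: `st.sidebar.selectbox` inside a `with st.expander(...)` block targets the sidebar container directly, so the two selectboxes render in the sidebar itself rather than inside the expander, leaving the expander holding only the markdown note.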
@@ -121,60 +133,47 @@ def image_to_base64(image_bytes: bytes) -> str:
     audio_bytes = audio_recorder(
         text="クリックして録音",
         recording_color="red",
-        neutral_color="black",
+        neutral_color="gray",
         icon_name="microphone",
         icon_size="2x",
-        key="audio_input"
+        key="audio_input",
     )
-
+
     if audio_bytes:
         st.audio(audio_bytes, format="audio/wav")
-
+
         # Save the audio data to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
             temp_audio_file.write(audio_bytes)
             temp_audio_file_path = temp_audio_file.name
-
-        # Run speech recognition when Azure OpenAI Whisper is configured
+        st.download_button(
+            label="🎧 録音データを保存",
+            data=audio_bytes,
+            file_name=f"recorded_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
+            mime="audio/wav",
+            use_container_width=True,
+        )
         try:
-            if (input_output_mode == "音声" and
-                azure_openai_endpoint and azure_openai_api_key and
-                azure_openai_model_stt):
-
+            if input_output_mode == "音声":
                 with st.spinner("音声を認識中..."):
-                    audio_blob = Blob(path=temp_audio_file_path)
-                    parser = AzureOpenAIWhisperParser(
-                        api_key=azure_openai_api_key,
-                        azure_endpoint=azure_openai_endpoint,
-                        api_version=azure_openai_api_version,
-                        deployment_name=azure_openai_model_stt,
-                    )
-                    documents = parser.lazy_parse(blob=audio_blob)
-                    results = [doc.page_content for doc in documents]
-                    prompt_text = "\n".join(results).strip()
-
+                    model = load_whisper_model(selected_model)
+                    language_param = None if transcription_language == "auto" else transcription_language
+                    result = model.transcribe(str(temp_audio_file_path), language=language_param)
+                    transcribed_text = result.get("text", "").strip()
+                    prompt_text = transcribed_text
+
                 if prompt_text:
                     st.success(f"音声認識完了: {prompt_text}")
                     prompt = prompt_text
                 else:
                     st.warning("音声が認識できませんでした")
-                    prompt = None
-            else:
-                # Placeholder when Azure OpenAI is not configured
-                prompt_text = "音声入力を受信しました(音声認識設定が必要です)"
-                prompt = prompt_text
-                st.info("音声認識を使用するには、サイドバーでAzure OpenAI設定を入力してください")
-
         except Exception as e:
             st.error(f"音声認識でエラーが発生しました: {e}")
             prompt_text = "音声入力でエラーが発生しました"
-            prompt = prompt_text
         finally:
-            # Delete the temporary file
-            import os
             if os.path.exists(temp_audio_file_path):
                 os.unlink(temp_audio_file_path)
-
+
 else:
     # Existing text input mode
     if prompt := st.chat_input(
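The recognition path in this hunk reduces to a plain openai-whisper call once the Streamlit plumbing is stripped away. A standalone sketch under the same assumptions (a WAV file on disk and ffmpeg on PATH, which `transcribe` uses for decoding; the file name is illustrative):

```python
import whisper

model = whisper.load_model("base")  # tiny/base/small/medium/large

# language=None mirrors the "auto" option above: Whisper detects the
# language from the first 30 seconds of audio.
result = model.transcribe("input.wav", language=None)
print(result["text"].strip())
```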
@@ -259,27 +258,24 @@ def image_to_base64(image_bytes: bytes) -> str:
             )
             last_message = response["messages"][-1]
             st.session_state["chat_history"].append(last_message)
-
+
             # Show the response and produce audio output
             response_content = last_message.content
             st.write(response_content)
-
+
             # In voice mode, add spoken output
             if input_output_mode == "音声":
                 try:
                     # Generate speech with gTTS
-                    tts = gTTS(text=response_content, lang='ja')
+                    tts = gTTS(text=response_content, lang="ja")
                     with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
                         tts.save(temp_audio_file.name)
-
+
                     # Read the audio file and play it with the Streamlit audio widget
                     with open(temp_audio_file.name, "rb") as audio_file:
                         audio_bytes = audio_file.read()
                     st.audio(audio_bytes, format="audio/mp3", autoplay=True)
-
-                    # Delete the temporary file
-                    import os
                     os.unlink(temp_audio_file.name)
-
+
                 except Exception as e:
                     st.warning(f"音声出力でエラーが発生しました: {e}")
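The unchanged output side is just as small in isolation: gTTS synthesizes an MP3 (it calls Google's TTS endpoint, so it needs network access), and `st.audio(..., autoplay=True)` plays the bytes back. A minimal round-trip sketch outside the app (the sample text is illustrative):

```python
import tempfile

from gtts import gTTS

tts = gTTS(text="こんにちは、音声モードのテストです。", lang="ja")
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
    tts.save(f.name)  # write the synthesized MP3 to the temp file
print(f"Saved MP3 to {f.name}")
```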