1
+ import io
1
2
import os
2
3
import tempfile
3
4
from base64 import b64encode
10
11
from langchain_community .callbacks .streamlit import (
11
12
StreamlitCallbackHandler ,
12
13
)
14
+ from pydub import AudioSegment
15
+ from pydub .effects import speedup
13
16
14
17
from template_langgraph .agents .chat_with_tools_agent .agent import (
15
18
AgentState ,
@@ -29,6 +32,56 @@ def load_whisper_model(model_size: str = "base"):
29
32
return whisper .load_model (model_size )
30
33
31
34
35
def _retime_by_frame_rate(segment: "AudioSegment", rate_factor: float) -> "AudioSegment":
    """Replay the samples of *segment* at ``rate_factor`` times its frame rate.

    The raw samples are re-labelled with a scaled frame rate and then
    resampled back to the segment's own rate, so the result keeps the input's
    frame rate. Pitch and duration change together: a factor above 1 gives
    shorter, higher audio; a factor below 1 gives longer, lower audio.
    """
    base_rate = segment.frame_rate
    # Clamp to 1 Hz so a tiny factor can never produce a zero frame rate.
    relabelled_rate = max(int(base_rate * rate_factor), 1)
    relabelled = segment._spawn(
        segment.raw_data,
        overrides={"frame_rate": relabelled_rate},
    )
    return relabelled.set_frame_rate(base_rate)


def synthesize_audio(
    text: str,
    language: str = "ja",
    speed: float = 1.0,
    pitch_shift: int = 0,
    volume_db: float = 0.0,
) -> bytes | None:
    """Convert text to MP3 speech with gTTS, then adjust it with pydub.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text produces
            no audio.
        language: Language code passed to gTTS as ``lang``.
        speed: Playback speed multiplier. Values above 1.0 are handled by
            ``pydub.effects.speedup``; values below 1.0 replay the samples at
            a lower frame rate, which also lowers the pitch.
        pitch_shift: Pitch change in semitones (12 per octave). It is applied
            by replaying the samples at a scaled frame rate, so it changes
            the duration as well.
        volume_db: Gain in dB added to the audio (negative attenuates).

    Returns:
        The MP3-encoded audio bytes, or ``None`` when there is nothing to
        speak or synthesis failed. Failures are reported through ``st.error``
        rather than raised.
    """
    # Guard before the try block: a missing response must not raise here.
    if not text or not text.strip():
        return None

    try:
        tts = gTTS(text=text, lang=language)
        mp3_buffer = io.BytesIO()
        tts.write_to_fp(mp3_buffer)
        mp3_buffer.seek(0)
        audio_segment = AudioSegment.from_file(mp3_buffer, format="mp3")

        if pitch_shift != 0:
            # Equal temperament: each semitone scales frequency by 2 ** (1 / 12).
            semitone_ratio = 2.0 ** (pitch_shift / 12.0)
            audio_segment = _retime_by_frame_rate(audio_segment, semitone_ratio)

        if speed > 1.0:
            audio_segment = speedup(audio_segment, playback_speed=float(speed))
        elif speed < 1.0:
            audio_segment = _retime_by_frame_rate(audio_segment, float(speed))

        if volume_db != 0:
            audio_segment += float(volume_db)

        output_buffer = io.BytesIO()
        audio_segment.export(output_buffer, format="mp3")
        return output_buffer.getvalue()
    except Exception as exc:  # pragma: no cover
        # UI boundary: show the failure to the user instead of crashing the app.
        st.error(f"音声合成に失敗しました: {exc} ")
        return None
83
+
84
+
32
85
if "chat_history" not in st .session_state :
33
86
st .session_state ["chat_history" ] = []
34
87
@@ -51,34 +104,58 @@ def load_whisper_model(model_size: str = "base"):
51
104
# In voice mode, show the speech-recognition (Whisper) and text-to-speech settings.
if input_output_mode == "音声":
    st.subheader("音声認識設定 (オプション)")

    # Speech-to-text: Whisper model ("base" preselected) and transcription language.
    selected_model = st.sidebar.selectbox(
        "Whisperモデル",
        ["tiny", "base", "small", "medium", "large"],
        index=1,
    )
    transcription_language = st.sidebar.selectbox(
        "文字起こし言語",
        ["auto", "ja", "en"],
        help="autoは言語自動判定です",
        index=0,
    )

    # Text-to-speech: language plus speed / pitch / volume controls.
    tts_language = st.sidebar.selectbox(
        "TTS言語",
        ["ja", "en", "fr", "de", "ko", "zh-CN"],
        index=0,
    )
    tts_speed = st.sidebar.slider(
        "再生速度", min_value=0.5, max_value=2.0, value=1.0, step=0.1
    )
    tts_pitch = st.sidebar.slider("ピッチ (半音)", min_value=-12, max_value=12, value=0)
    tts_volume = st.sidebar.slider("音量 (dB)", min_value=-20, max_value=10, value=0)
82
159
83
160
st .divider ()
84
161
st .subheader ("使用するツール" )
@@ -266,16 +343,14 @@ def load_whisper_model(model_size: str = "base"):
266
343
# In voice mode, also play the response as synthesized speech.
if input_output_mode == "音声":
    try:
        with st.spinner("音声を生成中です..."):
            audio_bytes = synthesize_audio(
                text=response_content,
                language=tts_language,
                speed=tts_speed,
                pitch_shift=tts_pitch,
                volume_db=tts_volume,
            )
        # synthesize_audio returns None for blank text or after a failure it
        # has already reported, so only render the player when audio exists.
        if audio_bytes:
            st.audio(audio_bytes, format="audio/mp3", autoplay=True)
    except Exception as e:
        st.warning(f"音声出力でエラーが発生しました: {e} ")
0 commit comments