1
- import io
2
1
import os
3
2
import tempfile
4
3
from base64 import b64encode
7
6
import streamlit as st
8
7
import whisper
9
8
from audio_recorder_streamlit import audio_recorder
10
- from gtts import gTTS
11
9
from langchain_community .callbacks .streamlit import (
12
10
StreamlitCallbackHandler ,
13
11
)
14
- from pydub import AudioSegment
15
- from pydub .effects import speedup
16
12
17
13
from template_langgraph .agents .chat_with_tools_agent .agent import (
18
14
AgentState ,
19
15
ChatWithToolsAgent ,
20
16
)
17
+ from template_langgraph .speeches .tts import synthesize_audio
21
18
from template_langgraph .tools .common import get_default_tools
22
19
23
20
@@ -32,56 +29,6 @@ def load_whisper_model(model_size: str = "base"):
32
29
return whisper .load_model (model_size )
33
30
34
31
35
- def synthesize_audio (
36
- text : str ,
37
- language : str = "ja" ,
38
- speed : float = 1.0 ,
39
- pitch_shift : int = 0 ,
40
- volume_db : float = 0.0 ,
41
- ) -> bytes | None :
42
- """Convert text to speech audio using gTTS and pydub adjustments."""
43
-
44
- if not text .strip ():
45
- return None
46
-
47
- try :
48
- tts = gTTS (text = text , lang = language )
49
- mp3_buffer = io .BytesIO ()
50
- tts .write_to_fp (mp3_buffer )
51
- mp3_buffer .seek (0 )
52
-
53
- audio_segment = AudioSegment .from_file (mp3_buffer , format = "mp3" )
54
- original_rate = audio_segment .frame_rate
55
-
56
- if pitch_shift != 0 :
57
- semitone_ratio = 2.0 ** (pitch_shift / 12.0 )
58
- shifted = audio_segment ._spawn (
59
- audio_segment .raw_data ,
60
- overrides = {"frame_rate" : int (original_rate * semitone_ratio )},
61
- )
62
- audio_segment = shifted .set_frame_rate (original_rate )
63
-
64
- if speed != 1.0 :
65
- if speed > 1.0 :
66
- audio_segment = speedup (audio_segment , playback_speed = float (speed ))
67
- else :
68
- slowed_rate = max (int (original_rate * float (speed )), 1 )
69
- audio_segment = audio_segment ._spawn (
70
- audio_segment .raw_data ,
71
- overrides = {"frame_rate" : slowed_rate },
72
- ).set_frame_rate (original_rate )
73
-
74
- if volume_db != 0 :
75
- audio_segment += float (volume_db )
76
-
77
- output_buffer = io .BytesIO ()
78
- audio_segment .export (output_buffer , format = "mp3" )
79
- return output_buffer .getvalue ()
80
- except Exception as exc : # pragma: no cover
81
- st .error (f"音声合成に失敗しました: { exc } " )
82
- return None
83
-
84
-
85
32
if "chat_history" not in st .session_state :
86
33
st .session_state ["chat_history" ] = []
87
34
0 commit comments