@@ -4,7 +4,6 @@
 from datetime import datetime

 import streamlit as st
-import whisper
 from audio_recorder_streamlit import audio_recorder
 from langchain_community.callbacks.streamlit import (
     StreamlitCallbackHandler,
@@ -14,6 +13,7 @@
     AgentState,
     ChatWithToolsAgent,
 )
+from template_langgraph.speeches.stt import SttWrapper
 from template_langgraph.speeches.tts import synthesize_audio
 from template_langgraph.tools.common import get_default_tools
@@ -23,10 +23,11 @@ def image_to_base64(image_bytes: bytes) -> str:


 @st.cache_resource(show_spinner=False)
-def load_whisper_model(model_size: str = "base"):
-    """Load a Whisper model only once per session."""
-
-    return whisper.load_model(model_size)
+def load_stt_wrapper(model_size: str = "base"):
+    """Load and cache the STT model."""
+    stt_wrapper = SttWrapper()
+    stt_wrapper.load_model(model_size)
+    return stt_wrapper


 if "chat_history" not in st.session_state:
@@ -178,9 +179,9 @@ def load_whisper_model(model_size: str = "base"):
     try:
         if input_output_mode == "音声":
             with st.spinner("音声を認識中..."):
-                model = load_whisper_model(selected_model)
+                stt_wrapper = load_stt_wrapper(selected_model)
                 language_param = None if transcription_language == "auto" else transcription_language
-                result = model.transcribe(str(temp_audio_file_path), language=language_param)
+                result = stt_wrapper.transcribe(str(temp_audio_file_path), language=language_param)
                 transcribed_text = result.get("text", "").strip()
                 prompt_text = transcribed_text
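For reference, a minimal usage sketch of the transcription path above (the file path and model size are hypothetical): passing `language=None` lets Whisper auto-detect the language, which matches the `"auto"` branch of `transcription_language`:

```python
stt_wrapper = load_stt_wrapper("base")  # cached by st.cache_resource
result = stt_wrapper.transcribe("/tmp/recording.wav", language=None)
print(result.get("text", "").strip())  # the recognized text
```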