44from datetime import datetime
55
66import streamlit as st
7- import whisper
87from audio_recorder_streamlit import audio_recorder
98from langchain_community .callbacks .streamlit import (
109 StreamlitCallbackHandler ,
1413 AgentState ,
1514 ChatWithToolsAgent ,
1615)
16+ from template_langgraph .speeches .stt import SttWrapper
1717from template_langgraph .speeches .tts import synthesize_audio
1818from template_langgraph .tools .common import get_default_tools
1919
@@ -23,10 +23,11 @@ def image_to_base64(image_bytes: bytes) -> str:
2323
2424
2525@st .cache_resource (show_spinner = False )
26- def load_whisper_model (model_size : str = "base" ):
27- """Load a Whisper model only once per session."""
28-
29- return whisper .load_model (model_size )
26+ def load_stt_wrapper (model_size : str = "base" ):
27+ """Create an SttWrapper, load the ``model_size`` model into it, and return the wrapper (cached via ``st.cache_resource``)."""
28+ stt_wrapper = SttWrapper ()
29+ stt_wrapper .load_model (model_size )
30+ return stt_wrapper
3031
3132
3233if "chat_history" not in st .session_state :
@@ -178,9 +179,9 @@ def load_whisper_model(model_size: str = "base"):
178179 try :
179180 if input_output_mode == "音声" :
180181 with st .spinner ("音声を認識中..." ):
181- model = load_whisper_model (selected_model )
182+ stt_wrapper = load_stt_wrapper (selected_model )
182183 language_param = None if transcription_language == "auto" else transcription_language
183- result = model .transcribe (str (temp_audio_file_path ), language = language_param )
184+ result = stt_wrapper .transcribe (str (temp_audio_file_path ), language = language_param )
184185 transcribed_text = result .get ("text" , "" ).strip ()
185186 prompt_text = transcribed_text
186187
0 commit comments