2 changes: 2 additions & 0 deletions pyproject.toml
@@ -27,12 +27,14 @@ dependencies = [
"langgraph>=0.6.2",
"langgraph-supervisor>=0.0.29",
"mlflow>=3.4.0",
"openai-whisper>=20250625",
"openai[realtime]>=1.98.0",
"opentelemetry-api>=1.36.0",
"opentelemetry-exporter-otlp>=1.36.0",
"opentelemetry-sdk>=1.36.0",
"psycopg2-binary>=2.9.10",
"pydantic-settings>=2.9.1",
"pydub>=0.25.1",
"pypdf>=5.9.0",
"python-dotenv>=1.1.0",
"qdrant-client>=1.15.1",
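The two new entries, openai-whisper and pydub, back the voice input/output added to the Streamlit page below. Both rely on a system ffmpeg binary for decoding and encoding; a minimal smoke test for a fresh environment might look like the sketch below (sample.wav is a placeholder path, not a file in this repo):

```python
# Hypothetical smoke test for the new dependencies; assumes ffmpeg is on PATH
# and that sample.wav exists locally. Not part of the PR itself.
import whisper
from pydub import AudioSegment

model = whisper.load_model("tiny")            # smallest checkpoint, fast to fetch
result = model.transcribe("sample.wav")       # returns a dict with "text", "segments", ...
print(result["text"].strip())

audio = AudioSegment.from_file("sample.wav")  # pydub decodes via ffmpeg
print(f"{len(audio) / 1000:.1f}s @ {audio.frame_rate} Hz")
```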
234 changes: 222 additions & 12 deletions template_langgraph/services/streamlits/pages/chat_with_tools_agent.py
@@ -1,9 +1,18 @@
import io
import os
import tempfile
from base64 import b64encode
from datetime import datetime

import streamlit as st
import whisper
from audio_recorder_streamlit import audio_recorder
from gtts import gTTS
from langchain_community.callbacks.streamlit import (
    StreamlitCallbackHandler,
)
from pydub import AudioSegment
from pydub.effects import speedup

from template_langgraph.agents.chat_with_tools_agent.agent import (
    AgentState,
@@ -16,11 +25,147 @@ def image_to_base64(image_bytes: bytes) -> str:
    return b64encode(image_bytes).decode("utf-8")


@st.cache_resource(show_spinner=False)
def load_whisper_model(model_size: str = "base"):
    """Load a Whisper model only once per session."""

    return whisper.load_model(model_size)


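A note on the loader above: Streamlit re-executes the whole script on every widget interaction, so without @st.cache_resource the multi-hundred-megabyte Whisper weights would be reloaded on each rerun. The cache is keyed by the function arguments, so each model_size is loaded once and then shared. A small illustration of the expected behavior, assuming standard st.cache_resource semantics:

```python
# Sketch of the caching behavior; both calls in one session return the
# same underlying model object rather than loading the weights twice.
base_a = load_whisper_model("base")
base_b = load_whisper_model("base")
assert base_a is base_b              # cache hit: no second load

large = load_whisper_model("large")  # different argument -> separate cache entry
```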
def synthesize_audio(
    text: str,
    language: str = "ja",
    speed: float = 1.0,
    pitch_shift: int = 0,
    volume_db: float = 0.0,
) -> bytes | None:
    """Convert text to speech audio using gTTS and pydub adjustments."""

    if not text.strip():
        return None

    try:
        tts = gTTS(text=text, lang=language)
        mp3_buffer = io.BytesIO()
        tts.write_to_fp(mp3_buffer)
        mp3_buffer.seek(0)

        audio_segment = AudioSegment.from_file(mp3_buffer, format="mp3")
        original_rate = audio_segment.frame_rate

        if pitch_shift != 0:
            semitone_ratio = 2.0 ** (pitch_shift / 12.0)
            shifted = audio_segment._spawn(
                audio_segment.raw_data,
                overrides={"frame_rate": int(original_rate * semitone_ratio)},
            )
            audio_segment = shifted.set_frame_rate(original_rate)

        if speed != 1.0:
            if speed > 1.0:
                audio_segment = speedup(audio_segment, playback_speed=float(speed))
            else:
                slowed_rate = max(int(original_rate * float(speed)), 1)
                audio_segment = audio_segment._spawn(
                    audio_segment.raw_data,
                    overrides={"frame_rate": slowed_rate},
                ).set_frame_rate(original_rate)

        if volume_db != 0:
            audio_segment += float(volume_db)

        output_buffer = io.BytesIO()
        audio_segment.export(output_buffer, format="mp3")
        return output_buffer.getvalue()
    except Exception as exc:  # pragma: no cover
        st.error(f"音声合成に失敗しました: {exc}")
        return None

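The pitch_shift branch in synthesize_audio is a resampling trick rather than a true pitch-shifter: declaring the raw samples to be at rate r * 2^(n/12) and then resetting the frame rate back to r raises the pitch by n semitones, at the cost of also time-stretching the audio (pitch and tempo move together). A standalone sketch of the same trick, with in.mp3 and out.mp3 as placeholder paths:

```python
# Minimal pitch-shift-by-resampling sketch; mirrors the _spawn/set_frame_rate
# idiom used above (_spawn is technically a private pydub API, as in the PR).
from pydub import AudioSegment

n_semitones = 3
seg = AudioSegment.from_file("in.mp3")
ratio = 2.0 ** (n_semitones / 12.0)
shifted = seg._spawn(seg.raw_data, overrides={"frame_rate": int(seg.frame_rate * ratio)})
shifted = shifted.set_frame_rate(seg.frame_rate)  # keep the container rate unchanged
shifted.export("out.mp3", format="mp3")
```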

if "chat_history" not in st.session_state:
st.session_state["chat_history"] = []

# Sidebar: tool selection and agent construction
# Sidebar: input/output mode selection, tool selection, and agent construction
with st.sidebar:
    st.subheader("入出力モード")

    # Input/output mode selection
    if "input_output_mode" not in st.session_state:
        st.session_state["input_output_mode"] = "テキスト"

    input_output_mode = st.radio(
        "モードを選択してください",
        options=["テキスト", "音声"],
        index=0 if st.session_state["input_output_mode"] == "テキスト" else 1,
        help="テキスト: 従来のテキスト入力/出力, 音声: マイク入力/音声出力",
    )
    st.session_state["input_output_mode"] = input_output_mode

    # In voice mode, show the Whisper settings
    if input_output_mode == "音声":
        st.subheader("音声認識設定 (オプション)")
        audio_bytes = audio_recorder(
            text="クリックして音声入力👉️",
            recording_color="red",
            neutral_color="gray",
            icon_name="microphone",
            icon_size="2x",
            key="audio_input",
        )
        selected_model = st.selectbox(
            "Whisperモデル",
            [
                "tiny",
                "base",
                "small",
                "medium",
                "large",
            ],
            index=1,
        )
        transcription_language = st.selectbox(
            "文字起こし言語",
            [
                "auto",
                "ja",
                "en",
            ],
            index=0,
            help="autoは言語自動判定です",
        )
        tts_language = st.selectbox(
            "TTS言語",
            [
                "ja",
                "en",
                "fr",
                "de",
                "ko",
                "zh-CN",
            ],
            index=0,
        )
        tts_speed = st.slider(
            "再生速度",
            min_value=0.5,
            max_value=2.0,
            step=0.1,
            value=1.0,
        )
        tts_pitch = st.slider(
            "ピッチ (半音)",
            min_value=-12,
            max_value=12,
            value=0,
        )
        tts_volume = st.slider(
            "音量 (dB)",
            min_value=-20,
            max_value=10,
            value=0,
        )

    st.divider()
    st.subheader("使用するツール")

    # Fetch the list of available tools
@@ -63,16 +208,63 @@ def image_to_base64(image_bytes: bytes) -> str:
    else:
        st.chat_message("assistant").write(msg.content)

if prompt := st.chat_input(
    accept_file="multiple",
    file_type=[
        "png",
        "jpg",
        "jpeg",
        "gif",
        "webp",
    ],
):
# Input section: branch on the selected mode
prompt = None
prompt_text = ""
prompt_files = []

if input_output_mode == "音声":
    if audio_bytes:
        st.audio(audio_bytes, format="audio/wav")

        # Save the audio data to a temporary file (see the note on delete=False
        # after this input section)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
            temp_audio_file.write(audio_bytes)
            temp_audio_file_path = temp_audio_file.name
        st.download_button(
            label="🎧 録音データを保存",
            data=audio_bytes,
            file_name=f"recorded_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
            mime="audio/wav",
            use_container_width=True,
        )
        try:
            # Transcribe the recording with Whisper; "auto" defers language
            # detection to the model (language=None)
            with st.spinner("音声を認識中..."):
                model = load_whisper_model(selected_model)
                language_param = None if transcription_language == "auto" else transcription_language
                result = model.transcribe(temp_audio_file_path, language=language_param)
                prompt_text = result.get("text", "").strip()

            if prompt_text:
                st.success(f"音声認識完了: {prompt_text}")
                prompt = prompt_text
            else:
                st.warning("音声が認識できませんでした")
        except Exception as e:
            st.error(f"音声認識でエラーが発生しました: {e}")
        finally:
            if os.path.exists(temp_audio_file_path):
                os.unlink(temp_audio_file_path)

else:
    # Existing text-input mode; st.chat_input returns None until a message is submitted
    prompt = st.chat_input(
        accept_file="multiple",
        file_type=[
            "png",
            "jpg",
            "jpeg",
            "gif",
            "webp",
        ],
    )
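A side note on the voice branch above: delete=False on the NamedTemporaryFile is what makes the handoff portable. Whisper reopens the file by path, and on Windows a NamedTemporaryFile that is still open cannot be opened a second time, so the file is closed first and the finally/os.unlink pair takes over cleanup. A minimal standalone sketch of the same pattern (run_on_temp_copy is a hypothetical helper, not part of the PR):

```python
# Hypothetical standalone version of the temp-file handoff used above.
import os
import tempfile

def run_on_temp_copy(payload: bytes) -> int:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(payload)
        path = f.name                 # file is closed when the with-block exits
    try:
        return os.path.getsize(path)  # stand-in for any path-based consumer
    finally:
        os.unlink(path)               # always clean up the temp file
```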

# Shared input-handling logic
if prompt:
    user_display_items = []
    message_parts = []

@@ -141,4 +333,22 @@ def image_to_base64(image_bytes: bytes) -> str:
        )
        last_message = response["messages"][-1]
        st.session_state["chat_history"].append(last_message)
        st.write(last_message.content)

        # Display the response and emit audio output
        response_content = last_message.content
        st.write(response_content)

        # In voice mode, additionally synthesize speech output
        if input_output_mode == "音声":
            try:
                with st.spinner("音声を生成中です..."):
                    tts_audio = synthesize_audio(
                        text=response_content,
                        language=tts_language,
                        speed=tts_speed,
                        pitch_shift=tts_pitch,
                        volume_db=tts_volume,
                    )
                if tts_audio:  # synthesize_audio returns None on empty text or failure
                    st.audio(tts_audio, format="audio/mp3", autoplay=True)
            except Exception as e:
                st.warning(f"音声出力でエラーが発生しました: {e}")