audio 出力設定を追加

ks6088ts · ks6088ts · commit f69d58375cc5 · 2025-09-27T23:03:11.000+09:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
     "opentelemetry-sdk>=1.36.0",
     "psycopg2-binary>=2.9.10",
     "pydantic-settings>=2.9.1",
+    "pydub>=0.25.1",
     "pypdf>=5.9.0",
     "python-dotenv>=1.1.0",
     "qdrant-client>=1.15.1",
diff --git a/template_langgraph/services/streamlits/pages/chat_with_tools_agent.py b/template_langgraph/services/streamlits/pages/chat_with_tools_agent.py
@@ -1,3 +1,4 @@
+import io
 import os
 import tempfile
 from base64 import b64encode
@@ -10,6 +11,8 @@
 from langchain_community.callbacks.streamlit import (
     StreamlitCallbackHandler,
 )
+from pydub import AudioSegment
+from pydub.effects import speedup
 
 from template_langgraph.agents.chat_with_tools_agent.agent import (
     AgentState,
@@ -29,6 +32,56 @@ def load_whisper_model(model_size: str = "base"):
     return whisper.load_model(model_size)
 
 
+def synthesize_audio(
+    text: str,
+    language: str = "ja",
+    speed: float = 1.0,
+    pitch_shift: int = 0,
+    volume_db: float = 0.0,
+) -> bytes | None:
+    """Convert text to speech audio using gTTS and pydub adjustments."""
+
+    if not text.strip():
+        return None
+
+    try:
+        tts = gTTS(text=text, lang=language)
+        mp3_buffer = io.BytesIO()
+        tts.write_to_fp(mp3_buffer)
+        mp3_buffer.seek(0)
+
+        audio_segment = AudioSegment.from_file(mp3_buffer, format="mp3")
+        original_rate = audio_segment.frame_rate
+
+        if pitch_shift != 0:
+            semitone_ratio = 2.0 ** (pitch_shift / 12.0)
+            shifted = audio_segment._spawn(
+                audio_segment.raw_data,
+                overrides={"frame_rate": int(original_rate * semitone_ratio)},
+            )
+            audio_segment = shifted.set_frame_rate(original_rate)
+
+        if speed != 1.0:
+            if speed > 1.0:
+                audio_segment = speedup(audio_segment, playback_speed=float(speed))
+            else:
+                slowed_rate = max(int(original_rate * float(speed)), 1)
+                audio_segment = audio_segment._spawn(
+                    audio_segment.raw_data,
+                    overrides={"frame_rate": slowed_rate},
+                ).set_frame_rate(original_rate)
+
+        if volume_db != 0:
+            audio_segment += float(volume_db)
+
+        output_buffer = io.BytesIO()
+        audio_segment.export(output_buffer, format="mp3")
+        return output_buffer.getvalue()
+    except Exception as exc:  # pragma: no cover
+        st.error(f"音声合成に失敗しました: {exc}")
+        return None
+
+
 if "chat_history" not in st.session_state:
     st.session_state["chat_history"] = []
 
@@ -51,34 +104,58 @@ def load_whisper_model(model_size: str = "base"):
     # 音声モードの場合、Whisper 設定を表示
     if input_output_mode == "音声":
         st.subheader("音声認識設定 (オプション)")
-        with st.expander("Whisper設定", expanded=False):
-            selected_model = st.sidebar.selectbox(
-                "Whisperモデル",
-                [
-                    "tiny",
-                    "base",
-                    "small",
-                    "medium",
-                    "large",
-                ],
-                index=1,
-            )
-            transcription_language = st.sidebar.selectbox(
-                "文字起こし言語",
-                [
-                    "auto",
-                    "ja",
-                    "en",
-                ],
-                index=0,
-                help="autoは言語自動判定です",
-            )
-            st.markdown(
-                """
-                - Whisperモデルは大きいほど高精度ですが、処理に時間がかかります。
-                - 文字起こし言語を指定することで、認識精度が向上します。
-                """
-            )
+        selected_model = st.sidebar.selectbox(
+            "Whisperモデル",
+            [
+                "tiny",
+                "base",
+                "small",
+                "medium",
+                "large",
+            ],
+            index=1,
+        )
+        transcription_language = st.sidebar.selectbox(
+            "文字起こし言語",
+            [
+                "auto",
+                "ja",
+                "en",
+            ],
+            index=0,
+            help="autoは言語自動判定です",
+        )
+        tts_language = st.sidebar.selectbox(
+            "TTS言語",
+            [
+                "ja",
+                "en",
+                "fr",
+                "de",
+                "ko",
+                "zh-CN",
+            ],
+            index=0,
+        )
+        tts_speed = st.sidebar.slider(
+            "再生速度",
+            min_value=0.5,
+            max_value=2.0,
+            step=0.1,
+            value=1.0,
+        )
+        tts_pitch = st.sidebar.slider(
+            "ピッチ (半音)",
+            min_value=-12,
+            max_value=12,
+            value=0,
+        )
+        tts_volume = st.sidebar.slider(
+            "音量 (dB)",
+            min_value=-20,
+            max_value=10,
+            value=0,
+        )
 
     st.divider()
     st.subheader("使用するツール")
@@ -266,16 +343,14 @@ def load_whisper_model(model_size: str = "base"):
         # 音声モードの場合、音声出力を追加
         if input_output_mode == "音声":
             try:
-                # gTTSを使って音声生成
-                tts = gTTS(text=response_content, lang="ja")
-                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
-                    tts.save(temp_audio_file.name)
-
-                    # 音声ファイルを読み込んでstreamlit audio widgetで再生
-                    with open(temp_audio_file.name, "rb") as audio_file:
-                        audio_bytes = audio_file.read()
-                        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
-                    os.unlink(temp_audio_file.name)
-
+                with st.spinner("音声を生成中です..."):
+                    audio_bytes = synthesize_audio(
+                        text=response_content,
+                        language=tts_language,
+                        speed=tts_speed,
+                        pitch_shift=tts_pitch,
+                        volume_db=tts_volume,
+                    )
+                    st.audio(audio_bytes, format="audio/mp3", autoplay=True)
             except Exception as e:
                 st.warning(f"音声出力でエラーが発生しました: {e}")
diff --git a/uv.lock b/uv.lock