-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_tts.py
More file actions
120 lines (96 loc) · 3.93 KB
/
generate_tts.py
File metadata and controls
120 lines (96 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import sys
import wave
from typing import Optional
from google import genai
from google.genai import types
# python-dotenv is an optional convenience: when it is installed, load_dotenv()
# is called so the os.getenv() lookups below can be satisfied from a .env file.
# Any failure to import or load it is deliberately ignored (best effort).
try:
    from dotenv import load_dotenv  # type: ignore
    load_dotenv()
except Exception:
    pass
def wave_file(filename: str, pcm: bytes, channels: int = 1, rate: int = 24000, sample_width: int = 2) -> None:
    """Write raw PCM audio to *filename* as an uncompressed WAV file.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Raw PCM frame data to store.
        channels: Number of audio channels.
        rate: Sampling rate in frames per second.
        sample_width: Size of one sample in bytes.
    """
    with wave.open(filename, "wb") as writer:
        # One setparams() call applies channels, sample width and frame rate in
        # that order; the frame count is a placeholder that the wave module
        # corrects once the frames have been written.
        writer.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)
def init_client() -> genai.Client:
    """Build a Vertex AI-backed ``genai.Client`` from environment variables.

    Reads ``VERTEX_AI_CREDENTIALS_PATH`` (optional), ``VERTEX_AI_PROJECT_ID``
    (required) and ``VERTEX_AI_LOCATION`` (defaults to ``us-central1``).

    Returns:
        A client created with ``vertexai=True`` for the configured project and
        location.

    Raises:
        SystemExit: With status 1, after printing a message, when
            ``VERTEX_AI_PROJECT_ID`` is unset or empty.
    """
    env = os.environ

    # Export the credentials file under the GOOGLE_APPLICATION_CREDENTIALS
    # name, but never override a value that is already set.
    key_file = env.get("VERTEX_AI_CREDENTIALS_PATH")
    if key_file and not env.get("GOOGLE_APPLICATION_CREDENTIALS"):
        env["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(key_file)

    project = env.get("VERTEX_AI_PROJECT_ID")
    region = env.get("VERTEX_AI_LOCATION", "us-central1")
    if project:
        return genai.Client(vertexai=True, project=project, location=region)

    print("Missing VERTEX_AI_PROJECT_ID in environment/.env")
    sys.exit(1)
def _tts_gemini(text: str, voice_name: str) -> bytes:
    """Synthesize *text* with the Gemini TTS preview model.

    Args:
        text: Prompt to be spoken.
        voice_name: Name of the prebuilt Gemini voice to use.

    Returns:
        The inline audio payload of the first response part that carries one.
        NOTE(review): presumably raw 16-bit / 24 kHz mono PCM, since that is
        what the WAV writer in this file assumes by default - confirm against
        the model documentation.

    Raises:
        RuntimeError: If the response contains no inline audio data.
        SystemExit: Propagated from ``init_client`` when the project ID is
            not configured.
    """
    client = init_client()
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        ),
    )

    # The response may come back without candidates, content, parts or inline
    # data. Walk it defensively instead of indexing [0] blindly, so a missing
    # payload yields one clear error rather than an IndexError/TypeError.
    for candidate in getattr(response, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        for part in getattr(content, "parts", None) or []:
            data = getattr(getattr(part, "inline_data", None), "data", None)
            if not data:
                continue
            if not isinstance(data, (bytes, bytearray)):
                data = bytes(data)
            return data

    raise RuntimeError("Gemini TTS response contained no audio data")
def _strip_wav_container(audio: bytes) -> bytes:
    """Return the raw PCM frames of *audio*, unwrapping a RIFF/WAVE container if present."""
    import io  # local import: only needed here, keeps the file's import block untouched

    if audio[:4] != b"RIFF":
        return audio
    try:
        with wave.open(io.BytesIO(audio), "rb") as reader:
            return reader.readframes(reader.getnframes())
    except (wave.Error, EOFError):
        # Not a parseable WAV after all; hand the bytes back unchanged rather
        # than turning a best-effort unwrap into a crash.
        return audio


def _tts_cloud(text: str, voice_name: str) -> bytes:
    """Synthesize *text* with Google Cloud Text-to-Speech (fallback backend).

    Args:
        text: Plain text to be spoken.
        voice_name: Gemini voice name; it is mapped to a Cloud TTS voice, and
            any unmapped name falls back to ``en-US-Neural2-C``.

    Returns:
        Raw 16-bit PCM at 24 kHz (the requested LINEAR16 encoding and sample
        rate), with the WAV header that Cloud TTS adds removed.

    Raises:
        SystemExit: With status 1, after printing install instructions, when
            ``google-cloud-texttospeech`` cannot be imported.
    """
    # Fallback using Google Cloud Text-to-Speech
    try:
        from google.cloud import texttospeech  # type: ignore
    except Exception:
        print("Missing dependency google-cloud-texttospeech. Install with: pip install google-cloud-texttospeech")
        sys.exit(1)

    # Ensure ADC is available even when not calling init_client()
    credentials_path = os.getenv("VERTEX_AI_CREDENTIALS_PATH")
    if credentials_path and not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(credentials_path)

    client = texttospeech.TextToSpeechClient()

    # Map Gemini voice to Cloud TTS voice if needed; default to en-US-Neural2-C
    mapped_voice = {
        "Kore": "en-US-Neural2-C",
    }.get(voice_name, "en-US-Neural2-C")

    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name=mapped_voice,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
    )
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # Cloud TTS returns LINEAR16 audio with a WAV header already attached.
    # The caller wraps the result in its own WAV container, so return bare PCM
    # here; otherwise the inner header would be written out as audio samples.
    return _strip_wav_container(response.audio_content)
def generate_tts(text: str, voice_name: str = "Kore", outfile: str = "out.wav", backend: str = "auto") -> str:
    """Synthesize *text* to a WAV file and return the path that was written.

    Args:
        text: Text (or prompt) to be spoken.
        voice_name: Voice name passed to the selected backend.
        outfile: Path of the WAV file to write.
        backend: ``"auto"`` tries Gemini first and falls back to Cloud TTS on
            any error; ``"gemini"`` uses Gemini only; any other value skips
            Gemini and uses Cloud TTS directly.

    Returns:
        ``outfile``, unchanged.

    Raises:
        SystemExit: With status 1 when ``backend == "gemini"`` and Gemini
            synthesis fails, or when a backend helper exits because of
            missing configuration or dependencies.
    """
    data: Optional[bytes] = None
    if backend in ("auto", "gemini"):
        try:
            data = _tts_gemini(text, voice_name)
        except Exception as e:
            # Fall back if not allowlisted or any Gemini error
            if backend == "gemini":
                print(f"Gemini TTS failed: {e}")
                sys.exit(1)
            # "auto": say why we are falling back instead of swallowing the
            # error silently. stderr keeps stdout clean for callers that
            # parse it.
            print(f"Gemini TTS failed ({e}); falling back to Cloud TTS", file=sys.stderr)
    if data is None:
        data = _tts_cloud(text, voice_name)
    wave_file(outfile, data)
    return outfile
if __name__ == "__main__":
    # Script entry point: prompt, voice and output path all come from the
    # environment, each with a built-in default.
    saved_path = generate_tts(
        text=os.getenv("TTS_TEXT", "Say cheerfully: Have a wonderful day!"),
        voice_name=os.getenv("TTS_VOICE", "Kore"),
        outfile=os.getenv("TTS_OUT", "out.wav"),
    )
    print(f"Saved TTS to {saved_path}")