Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions assets/translations/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@
"tuningItemTTSVoice": "TTS Voice",
"tuningItemTTSJaVoiceName": "Japanese voice name",
"tuningItemTTSEnVoiceName": "English voice name",
"tuningItemTTSJaPrompt": "Japanese prompt",
"tuningItemTTSEnPrompt": "English prompt",
"tuningItemLocationAccuracy": "location accuracy",
"headerDelayTooShortErrorText": "Header animation delay must be greater than the Header animation duration.",
"nanErrorText": "The value must be entered numerically.",
Expand Down
2 changes: 2 additions & 0 deletions assets/translations/ja.json
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@
"tuningItemTTSVoice": "TTS ボイス",
"tuningItemTTSJaVoiceName": "日本語ボイス名",
"tuningItemTTSEnVoiceName": "英語ボイス名",
"tuningItemTTSJaPrompt": "日本語プロンプト",
"tuningItemTTSEnPrompt": "英語プロンプト",
"tuningItemLocationAccuracy": "位置情報の精度",
"headerDelayTooShortErrorText": "ヘッダーアニメーション遅延はヘッダーアニメーション時間より大きい値である必要があります。",
"nanErrorText": "値は数値で入力してください。",
Expand Down
127 changes: 86 additions & 41 deletions functions/src/funcs/tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,34 @@ const googleAuth = new GoogleAuth({
scopes: ['https://www.googleapis.com/auth/cloud-platform'],
});

const GEMINI_TTS_MODEL = 'gemini-2.5-flash-tts';
const GOOGLE_TTS_API_VERSION = 'v1';
const DEFAULT_JA_VOICE_NAME = 'ja-JP-Neural2-B';
const DEFAULT_EN_VOICE_NAME = 'en-US-Neural2-F';

// Maximum age of a cached TTS config before Firestore is read again.
const TTS_CONFIG_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
// Module-level cache of the most recent TTS config read; starts as null and
// is replaced each time getTtsConfig completes a Firestore read.
let ttsConfigCache: {
// Payload returned by doc.data(); an undefined result is cached as well.
data: FirebaseFirestore.DocumentData | undefined;
// Date.now() value (milliseconds) captured when `data` was stored.
fetchedAt: number;
} | null = null;

/**
 * Returns the TTS configuration stored at `configs/tts` in Firestore.
 *
 * A successful read is kept in the module-level `ttsConfigCache` and reused
 * until it is older than `TTS_CONFIG_CACHE_TTL_MS`. When a refresh fails and
 * an earlier result is still cached, that stale result is returned instead of
 * failing the call, and the failure is logged so it does not go unnoticed.
 *
 * @returns The document data, or `undefined` when `doc.data()` returns none.
 * @throws Re-throws the Firestore read error when nothing has been cached yet.
 */
const getTtsConfig = async (): Promise<
  FirebaseFirestore.DocumentData | undefined
> => {
  const cached = ttsConfigCache;
  if (cached && Date.now() - cached.fetchedAt < TTS_CONFIG_CACHE_TTL_MS) {
    return cached.data;
  }
  try {
    const doc = await firestore.collection('configs').doc('tts').get();
    const data = doc.data();
    ttsConfigCache = { data, fetchedAt: Date.now() };
    return data;
  } catch (e) {
    // Re-read the module-level cache rather than `cached`: a concurrent call
    // may have populated it while this read was pending.
    if (ttsConfigCache) {
      // Previously this fallback was silent, hiding Firestore outages for as
      // long as any cached value existed.
      console.warn(
        'Failed to refresh TTS config from Firestore, serving stale cache:',
        e
      );
      return ttsConfigCache.data;
    }
    throw e;
  }
};

interface SynthesizedAudio {
audioContent: string;
Expand Down Expand Up @@ -80,24 +106,27 @@ const stripSsml = (text: string): string =>
.replace(/\s{2,}/g, ' ')
.trim();

/** Cloud Text-to-Speech の Gemini-TTS を使用してテキストを音声に変換する。 */
const synthesizeWithGemini = async (
/**
 * Obtains an access token from the shared `googleAuth` instance for calling
 * the Text-to-Speech API.
 *
 * @returns The non-empty access token string.
 * @throws Error when the auth client returns no token.
 */
const getAccessToken = async (): Promise<string> => {
  const authClient = await googleAuth.getClient();
  const { token } = await authClient.getAccessToken();
  if (token) {
    return token;
  }
  // A null, undefined or empty token cannot be used as a bearer credential.
  throw new Error('Failed to acquire Google access token for TTS');
};

/** Cloud Text-to-Speech の Neural2 を使用してテキストを音声に変換する。 */
const synthesizeWithNeural2 = async (
projectId: string,
accessToken: string,
text: string,
languageCode: string,
voiceName: string,
prompt?: string,
options?: {
volumeGainDb?: number;
}
): Promise<SynthesizedAudio> => {
const client = await googleAuth.getClient();
const accessTokenResponse = await client.getAccessToken();
const accessToken = accessTokenResponse.token;
if (!accessToken) {
throw new Error('Failed to acquire Google access token for Gemini TTS');
}

const ttsUrl = `https://texttospeech.googleapis.com/${GOOGLE_TTS_API_VERSION}/text:synthesize`;
const res = await fetch(ttsUrl, {
headers: {
Expand All @@ -107,13 +136,11 @@ const synthesizeWithGemini = async (
},
body: JSON.stringify({
input: {
text: stripSsml(text),
...(prompt ? { prompt } : {}),
ssml: text,
},
voice: {
languageCode,
name: voiceName,
modelName: GEMINI_TTS_MODEL,
},
audioConfig: {
audioEncoding: 'MP3',
Expand All @@ -132,7 +159,7 @@ const synthesizeWithGemini = async (
};
if (!res.ok || !json.audioContent) {
throw new Error(
`Gemini TTS API returned ${res.status}: ${JSON.stringify(json.error ?? json)}`
`Neural2 TTS API returned ${res.status}: ${JSON.stringify(json.error ?? json)}`
);
}

Expand Down Expand Up @@ -180,36 +207,66 @@ export const tts = onCall(

let ttsConfig: FirebaseFirestore.DocumentData | undefined;
try {
const ttsConfigDoc = await firestore
.collection('configs')
.doc('tts')
.get();
ttsConfig = ttsConfigDoc.data();
ttsConfig = await getTtsConfig();
} catch (e) {
console.warn(
'Failed to read TTS config from Firestore, using defaults:',
e
);
}
const defaultJaVoice = ttsConfig?.jaVoiceName || 'Aoede';
const defaultEnVoice = ttsConfig?.enVoiceName || 'Aoede';
const defaultJaVoice = ttsConfig?.jaVoiceName || DEFAULT_JA_VOICE_NAME;
const defaultEnVoice = ttsConfig?.enVoiceName || DEFAULT_EN_VOICE_NAME;

const jaVoiceName =
(typeof req.data.jaVoiceName === 'string' && req.data.jaVoiceName) ||
(typeof req.data.jaVoiceName === 'string' &&
req.data.jaVoiceName.trim()) ||
defaultJaVoice;
const enVoiceName =
(typeof req.data.enVoiceName === 'string' && req.data.enVoiceName) ||
(typeof req.data.enVoiceName === 'string' &&
req.data.enVoiceName.trim()) ||
defaultEnVoice;

const strippedJa = stripSsml(ssmlJa);
const strippedEn = stripSsml(ssmlEn);

if (strippedJa.trim().length === 0) {
throw new HttpsError(
'invalid-argument',
'ssmlJa contains no visible text after stripping SSML tags'
);
}
if (strippedEn.trim().length === 0) {
throw new HttpsError(
'invalid-argument',
'ssmlEn contains no visible text after stripping SSML tags'
);
}

const TEXT_BYTE_LIMIT = 4000;
const jaTextBytes = Buffer.byteLength(strippedJa, 'utf8');
const enTextBytes = Buffer.byteLength(strippedEn, 'utf8');

if (jaTextBytes > TEXT_BYTE_LIMIT) {
throw new HttpsError(
'invalid-argument',
`ssmlJa text exceeds ${TEXT_BYTE_LIMIT} byte limit (${jaTextBytes} bytes)`
);
}
if (enTextBytes > TEXT_BYTE_LIMIT) {
throw new HttpsError(
'invalid-argument',
`ssmlEn text exceeds ${TEXT_BYTE_LIMIT} byte limit (${enTextBytes} bytes)`
);
}

const voicesCollection = firestore
.collection('caches')
.doc('tts')
.collection('voices');

const hashAlgorithm = 'sha256';
const version = 9;
const version = 11;
const hashPayloadObj = {
enModel: GEMINI_TTS_MODEL,
enVoiceName,
jaVoiceName,
ssmlEn,
Expand Down Expand Up @@ -276,21 +333,10 @@ export const tts = onCall(
}

try {
const accessToken = await getAccessToken();
const [jaAudio, enAudio] = await Promise.all([
synthesizeWithGemini(
projectId,
ssmlJa,
'ja-JP',
jaVoiceName,
'以下の日本語を、現代的な鉄道自動放送のように読み上げてください。全体的に平板なイントネーションを維持し、感情を込めず淡々と読んでください。文のイントネーションは文末に向かって自然に下降させてください。助詞(は、の、で、を等)で不自然にピッチを上げないでください。駅名や路線名は平板アクセントで読んでください(一般会話のアクセントとは異なります)。無駄な間を入れず、一定のテンポで読み進めてください。漢字の読みは一文字も省略せず正確に読んでください。特に路線名は正式な読みに従ってください(例:副都心線→ふくとしんせん、東海道線→とうかいどうせん、山手線→やまのてせん)。鉄道会社の略称も正確に読んでください(例:名鉄→めいてつ、京急→けいきゅう、京王→けいおう、阪急→はんきゅう、阪神→はんしん、南海→なんかい、近鉄→きんてつ、西鉄→にしてつ、東急→とうきゅう、小田急→おだきゅう、京成→けいせい、相鉄→そうてつ)。'
),
synthesizeWithGemini(
projectId,
ssmlEn,
'en-US',
enVoiceName,
'Read the following in a calm, clear, and composed tone like a modern train announcement. Speak quickly and crisply with a swift, efficient delivery. Do not linger on words or pause unnecessarily. Maintain a steady, relaxed intonation despite the fast pace. The text contains Japanese railway station names and line names in romanized form. Pronounce them using Japanese vowel rules, NOT English rules: a=ah, i=ee, u=oo, e=eh, o=oh. Every vowel is always pronounced the same way regardless of surrounding letters (e.g. "Inage" = ee-nah-geh, NOT "inn-idge"; "Meguro" = meh-goo-roh; "Ebisu" = eh-bee-soo; "Ome" = oh-meh, NOT "ohm"). Never apply English spelling conventions like silent e, soft g, or vowel shifts to these names.'
),
synthesizeWithNeural2(projectId, accessToken, ssmlJa, 'ja-JP', jaVoiceName),
synthesizeWithNeural2(projectId, accessToken, ssmlEn, 'en-US', enVoiceName),
]);
const jaAudioContent = jaAudio.audioContent;
const jaAudioMimeType = jaAudio.mimeType || 'audio/mpeg';
Expand All @@ -310,7 +356,6 @@ export const tts = onCall(
ssmlEn,
voiceJa: jaVoiceName,
voiceEn: enVoiceName,
enModel: GEMINI_TTS_MODEL,
},
})
.catch((err) => {
Expand Down
Loading
Loading