Skip to content

Commit ac01fc8

Browse files
committed
OpenAI's limit of 4096 characters per TTS request caused the generated audio to be cut short: the text was truncated, so the audio covered only the beginning of the full summary.
1 parent 4a9a1ad commit ac01fc8

File tree

2 files changed

+247
-119
lines changed

2 files changed

+247
-119
lines changed

services/tts.py

Lines changed: 139 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import re
88
import logging
9-
from typing import Optional, Literal
9+
from typing import Optional, Literal, List
1010
from elevenlabs.client import ElevenLabs
1111
from bs4 import BeautifulSoup
1212

@@ -97,6 +97,61 @@ def extract_summary_text_for_tts(html_content: str) -> str:
9797
return text
9898

9999

100+
def _split_text_into_chunks(text: str, max_chars: int) -> List[str]:
101+
"""
102+
Split text into chunks at sentence/paragraph boundaries, each under max_chars.
103+
104+
Splits preferably at paragraph breaks (double newline), then sentence ends
105+
(. ! ?). If a single sentence still exceeds max_chars, it is split at the
106+
last space before the limit.
107+
108+
Args:
109+
text: Text to split
110+
max_chars: Maximum characters per chunk
111+
112+
Returns:
113+
List of text chunks, each at most max_chars characters
114+
"""
115+
if len(text) <= max_chars:
116+
return [text]
117+
118+
# Split on sentence-ending punctuation followed by spaces, or paragraph breaks.
119+
# The lookbehind keeps the punctuation attached to the preceding sentence.
120+
parts = re.split(r"(?<=[.!?]) +|\n\n", text)
121+
122+
chunks: List[str] = []
123+
current_chunk = ""
124+
125+
for part in parts:
126+
part = part.strip()
127+
if not part:
128+
continue
129+
130+
candidate = (current_chunk + " " + part).strip() if current_chunk else part
131+
132+
if len(candidate) <= max_chars:
133+
current_chunk = candidate
134+
else:
135+
# Flush the accumulated chunk
136+
if current_chunk:
137+
chunks.append(current_chunk)
138+
139+
# If the part itself is too long, hard-split at word boundaries
140+
while len(part) > max_chars:
141+
split_at = part.rfind(" ", 0, max_chars)
142+
if split_at == -1:
143+
split_at = max_chars
144+
chunks.append(part[:split_at])
145+
part = part[split_at:].lstrip()
146+
147+
current_chunk = part
148+
149+
if current_chunk:
150+
chunks.append(current_chunk)
151+
152+
return chunks
153+
154+
100155
def _generate_audio_openai(
101156
text: str,
102157
voice: str,
@@ -122,40 +177,44 @@ def _generate_audio_openai(
122177
123178
Note:
124179
OpenAI TTS has a 4096 character limit per request.
125-
For longer text, this function will truncate.
180+
Longer text is split into chunks at sentence boundaries and the
181+
resulting audio bytes are concatenated into a single MP3 stream.
126182
"""
127183
max_chars = 4096
184+
chunks = _split_text_into_chunks(text, max_chars)
128185

129-
# Truncate if needed
130-
if len(text) > max_chars:
131-
text = text[: max_chars - 3] + "..."
186+
if len(chunks) > 1:
187+
logger.info(
188+
f"Text split into {len(chunks)} chunks for OpenAI TTS ({len(text)} chars total)"
189+
)
132190

133-
try:
134-
# Use tracked client for automatic usage tracking
135-
client = get_tracked_openai_client()
191+
client = get_tracked_openai_client()
192+
audio_parts: List[bytes] = []
136193

137-
response = client.text_to_speech(
138-
text=text,
139-
voice=voice,
140-
model=model,
141-
feature=feature,
142-
video_id=video_id,
143-
)
194+
for chunk in chunks:
195+
try:
196+
response = client.text_to_speech(
197+
text=chunk,
198+
voice=voice,
199+
model=model,
200+
feature=feature,
201+
video_id=video_id,
202+
)
203+
audio_parts.append(response.read())
144204

145-
# Read the audio data from the response
146-
return response.read()
205+
except Exception as e:
206+
error_msg = str(e)
147207

148-
except Exception as e:
149-
error_msg = str(e)
150-
151-
if "401" in error_msg or "unauthorized" in error_msg.lower():
152-
raise TTSAPIError("Invalid OpenAI API key")
153-
elif "429" in error_msg or "rate" in error_msg.lower():
154-
raise TTSAPIError(f"Rate limited: {error_msg}")
155-
elif "insufficient_quota" in error_msg.lower():
156-
raise TTSAPIError(f"Insufficient quota: {error_msg}")
157-
else:
158-
raise TTSAPIError(f"OpenAI TTS failed: {error_msg}")
208+
if "401" in error_msg or "unauthorized" in error_msg.lower():
209+
raise TTSAPIError("Invalid OpenAI API key")
210+
elif "429" in error_msg or "rate" in error_msg.lower():
211+
raise TTSAPIError(f"Rate limited: {error_msg}")
212+
elif "insufficient_quota" in error_msg.lower():
213+
raise TTSAPIError(f"Insufficient quota: {error_msg}")
214+
else:
215+
raise TTSAPIError(f"OpenAI TTS failed: {error_msg}")
216+
217+
return b"".join(audio_parts)
159218

160219

161220
def _generate_audio_elevenlabs(
@@ -198,61 +257,66 @@ def _generate_audio_elevenlabs(
198257
"eleven_monolingual_v1": 5000,
199258
}
200259
max_chars = model_limits.get(model_id, 10000)
260+
chunks = _split_text_into_chunks(text, max_chars)
201261

202-
# Truncate if needed
203-
if len(text) > max_chars:
204-
text = text[: max_chars - 3] + "..."
205-
206-
try:
207-
client = ElevenLabs(api_key=api_key)
208-
209-
audio_generator = client.text_to_speech.convert(
210-
text=text,
211-
voice_id=voice_id,
212-
model_id=model_id,
213-
output_format=output_format,
262+
if len(chunks) > 1:
263+
logger.info(
264+
f"Text split into {len(chunks)} chunks for ElevenLabs TTS ({len(text)} chars total)"
214265
)
215266

216-
audio_data = b"".join(audio_generator)
267+
client = ElevenLabs(api_key=api_key)
268+
audio_parts: List[bytes] = []
269+
total_chars = 0
217270

218-
# Track usage - ElevenLabs TTS priced per character
271+
for chunk in chunks:
219272
try:
220-
metadata = {
221-
"character_count": len(text),
222-
"voice_id": voice_id,
223-
"output_format": output_format,
224-
}
225-
226-
log_llm_usage(
227-
provider="elevenlabs",
228-
model=model_id,
229-
feature=feature,
230-
prompt_tokens=len(text), # Store character count in prompt_tokens
231-
response_tokens=0, # TTS doesn't have response tokens
232-
video_id=video_id,
233-
metadata=metadata,
234-
)
235-
logger.info(
236-
f"ElevenLabs TTS {model_id} call tracked for {feature} ({len(text)} chars)"
273+
audio_generator = client.text_to_speech.convert(
274+
text=chunk,
275+
voice_id=voice_id,
276+
model_id=model_id,
277+
output_format=output_format,
237278
)
238-
except Exception as e:
239-
logger.warning(f"Failed to track ElevenLabs TTS usage: {e}")
240-
241-
return audio_data
279+
audio_parts.append(b"".join(audio_generator))
280+
total_chars += len(chunk)
242281

282+
except Exception as e:
283+
error_msg = str(e)
284+
285+
if "quota_exceeded" in error_msg.lower():
286+
raise TTSAPIError(f"Quota exceeded: {error_msg}")
287+
elif "401" in error_msg or "unauthorized" in error_msg.lower():
288+
raise TTSAPIError("Invalid ElevenLabs API key")
289+
elif "402" in error_msg or "payment_required" in error_msg.lower():
290+
raise TTSAPIError(f"Payment required: {error_msg}")
291+
elif "429" in error_msg or "rate" in error_msg.lower():
292+
raise TTSAPIError(f"Rate limited: {error_msg}")
293+
else:
294+
raise TTSAPIError(f"ElevenLabs TTS failed: {error_msg}")
295+
296+
# Track usage - ElevenLabs TTS priced per character
297+
try:
298+
metadata = {
299+
"character_count": total_chars,
300+
"voice_id": voice_id,
301+
"output_format": output_format,
302+
}
303+
304+
log_llm_usage(
305+
provider="elevenlabs",
306+
model=model_id,
307+
feature=feature,
308+
prompt_tokens=total_chars, # Store character count in prompt_tokens
309+
response_tokens=0, # TTS doesn't have response tokens
310+
video_id=video_id,
311+
metadata=metadata,
312+
)
313+
logger.info(
314+
f"ElevenLabs TTS {model_id} call tracked for {feature} ({total_chars} chars)"
315+
)
243316
except Exception as e:
244-
error_msg = str(e)
245-
246-
if "quota_exceeded" in error_msg.lower():
247-
raise TTSAPIError(f"Quota exceeded: {error_msg}")
248-
elif "401" in error_msg or "unauthorized" in error_msg.lower():
249-
raise TTSAPIError("Invalid ElevenLabs API key")
250-
elif "402" in error_msg or "payment_required" in error_msg.lower():
251-
raise TTSAPIError(f"Payment required: {error_msg}")
252-
elif "429" in error_msg or "rate" in error_msg.lower():
253-
raise TTSAPIError(f"Rate limited: {error_msg}")
254-
else:
255-
raise TTSAPIError(f"ElevenLabs TTS failed: {error_msg}")
317+
logger.warning(f"Failed to track ElevenLabs TTS usage: {e}")
318+
319+
return b"".join(audio_parts)
256320

257321

258322
def generate_audio(

0 commit comments

Comments
 (0)