Skip to content

Commit dbb66ff

Browse files
Merge pull request #322 from EasyMetaAu/up-master
Improve Audio Pause Handling, MP3 Encoding, and Robust Text Normalization/Splitting
2 parents: 543cbec + f7fb9c5; commit: dbb66ff

File tree

9 files changed

+369
-185
lines changed

9 files changed

+369
-185
lines changed

api/src/services/audio.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,12 +80,12 @@ def find_first_last_non_silent(
8080
non_silent_index_start, non_silent_index_end = None, None
8181

8282
for X in range(0, len(audio_data)):
83-
if audio_data[X] > amplitude_threshold:
83+
if abs(audio_data[X]) > amplitude_threshold:
8484
non_silent_index_start = X
8585
break
8686

8787
for X in range(len(audio_data) - 1, -1, -1):
88-
if audio_data[X] > amplitude_threshold:
88+
if abs(audio_data[X]) > amplitude_threshold:
8989
non_silent_index_end = X
9090
break
9191

api/src/services/streaming_audio_writer.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,29 @@ def __init__(self, format: str, sample_rate: int, channels: int = 1):
3232
if self.format in ["wav", "flac", "mp3", "pcm", "aac", "opus"]:
3333
if self.format != "pcm":
3434
self.output_buffer = BytesIO()
35+
container_options = {}
36+
# Try disabling Xing VBR header for MP3 to fix iOS timeline reading issues
37+
if self.format == 'mp3':
38+
# Disable Xing VBR header
39+
container_options = {'write_xing': '0'}
40+
logger.debug("Disabling Xing VBR header for MP3 encoding.")
41+
3542
self.container = av.open(
3643
self.output_buffer,
3744
mode="w",
3845
format=self.format if self.format != "aac" else "adts",
46+
options=container_options # Pass options here
3947
)
4048
self.stream = self.container.add_stream(
4149
codec_map[self.format],
42-
sample_rate=self.sample_rate,
50+
rate=self.sample_rate, # Correct parameter name is 'rate'
4351
layout="mono" if self.channels == 1 else "stereo",
4452
)
45-
self.stream.bit_rate = 128000
53+
# Set bit_rate only for codecs where it's applicable and useful
54+
if self.format in ['mp3', 'aac', 'opus']:
55+
self.stream.bit_rate = 128000 # Example bitrate, can be configured
4656
else:
47-
raise ValueError(f"Unsupported format: {format}")
57+
raise ValueError(f"Unsupported format: {self.format}") # Use self.format here
4858

4959
def close(self):
5060
if hasattr(self, "container"):
@@ -65,12 +75,18 @@ def write_chunk(
6575

6676
if finalize:
6777
if self.format != "pcm":
78+
# Flush stream encoder
6879
packets = self.stream.encode(None)
6980
for packet in packets:
7081
self.container.mux(packet)
7182

83+
# Closing the container handles writing the trailer and finalizing the file.
84+
# No explicit flush method is available or needed here.
85+
logger.debug("Muxed final packets.")
86+
87+
# Get the final bytes from the buffer *before* closing it
7288
data = self.output_buffer.getvalue()
73-
self.close()
89+
self.close() # Close container and buffer
7490
return data
7591

7692
if audio_data is None or len(audio_data) == 0:

api/src/services/text_processing/normalizer.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ def handle_time(t: re.Match[str]) -> str:
391391

392392
def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
393393
"""Normalize text for TTS processing"""
394+
394395
# Handle email addresses first if enabled
395396
if normalization_options.email_normalization:
396397
text = EMAIL_PATTERN.sub(handle_email, text)
@@ -415,7 +416,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
415416
text,
416417
)
417418

418-
# Replace quotes and brackets
419+
# Replace quotes and brackets (additional cleanup)
419420
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
420421
text = text.replace("«", chr(8220)).replace("»", chr(8221))
421422
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
@@ -435,6 +436,11 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
435436
text = re.sub(r" +", " ", text)
436437
text = re.sub(r"(?<=\n) +(?=\n)", "", text)
437438

439+
# Handle special characters that might cause audio artifacts first
440+
# Replace newlines with spaces (or pauses if needed)
441+
text = text.replace('\n', ' ')
442+
text = text.replace('\r', ' ')
443+
438444
# Handle titles and abbreviations
439445
text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
440446
text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
@@ -445,7 +451,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
445451
# Handle common words
446452
text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
447453

448-
# Handle numbers and money
454+
# Handle numbers and money BEFORE replacing special characters
449455
text = re.sub(r"(?<=\d),(?=\d)", "", text)
450456

451457
text = MONEY_PATTERN.sub(
@@ -457,6 +463,22 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
457463

458464
text = re.sub(r"\d*\.\d+", handle_decimal, text)
459465

466+
# Handle other problematic symbols AFTER money/number processing
467+
text = text.replace('~', '') # Remove tilde
468+
text = text.replace('@', ' at ') # At symbol
469+
text = text.replace('#', ' number ') # Hash/pound
470+
text = text.replace('$', ' dollar ') # Dollar sign (if not handled by money pattern)
471+
text = text.replace('%', ' percent ') # Percent sign
472+
text = text.replace('^', '') # Caret
473+
text = text.replace('&', ' and ') # Ampersand
474+
text = text.replace('*', '') # Asterisk
475+
text = text.replace('_', ' ') # Underscore to space
476+
text = text.replace('|', ' ') # Pipe to space
477+
text = text.replace('\\', ' ') # Backslash to space
478+
text = text.replace('/', ' slash ') # Forward slash to space (unless in URLs)
479+
text = text.replace('=', ' equals ') # Equals sign
480+
text = text.replace('+', ' plus ') # Plus sign
481+
460482
# Handle various formatting
461483
text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)
462484
text = re.sub(r"(?<=\d)S", " S", text)

api/src/services/text_processing/phonemizer.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import phonemizer
55

66
from .normalizer import normalize_text
7+
from ...structures.schemas import NormalizationOptions
78

89
phonemizers = {}
910

@@ -95,8 +96,20 @@ def phonemize(text: str, language: str = "a", normalize: bool = True) -> str:
9596
Phonemized text
9697
"""
9798
global phonemizers
99+
100+
# Strip input text first to remove problematic leading/trailing spaces
101+
text = text.strip()
102+
98103
if normalize:
99-
text = normalize_text(text)
104+
# Create default normalization options and normalize text
105+
normalization_options = NormalizationOptions()
106+
text = normalize_text(text, normalization_options)
107+
# Strip again after normalization
108+
text = text.strip()
109+
100110
if language not in phonemizers:
101111
phonemizers[language] = create_phonemizer(language)
102-
return phonemizers[language].phonemize(text)
112+
113+
result = phonemizers[language].phonemize(text)
114+
# Final strip to ensure no leading/trailing spaces in phonemes
115+
return result.strip()

0 commit comments

Comments
 (0)