kprinssu
diff --git a/api/src/routers/development.py b/api/src/routers/development.py
Lines changed: 10 additions & 6 deletions
diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py
Lines changed: 96 additions & 41 deletions
@@ -104,7 +104,7 @@ async def generate_chunks():
 
                 if chunk_audio is not None:
                     # Normalize audio before writing
-                    normalized_audio = await normalizer.normalize(chunk_audio)
+                    normalized_audio = normalizer.normalize(chunk_audio)
                     # Write chunk and yield bytes
                     chunk_bytes = writer.write_chunk(normalized_audio)
                     if chunk_bytes:
@@ -114,6 +114,7 @@ async def generate_chunks():
                     final_bytes = writer.write_chunk(finalize=True)
                     if final_bytes:
                         yield final_bytes
+                        writer.close()
                 else:
                     raise ValueError("Failed to generate audio data")
 
@@ -223,10 +224,13 @@ async def dual_output():
                                 ).decode("utf-8")
 
                                 # Add any chunks that may be in the acumulator into the return word_timestamps
-                                chunk_data.word_timestamps = (
-                                    timestamp_acumulator + chunk_data.word_timestamps
-                                )
-                                timestamp_acumulator = []
+                                if chunk_data.word_timestamps is not None:
+                                    chunk_data.word_timestamps = (
+                                        timestamp_acumulator + chunk_data.word_timestamps
+                                    )
+                                    timestamp_acumulator = []
+                                else:
+                                    chunk_data.word_timestamps = []
 
                                 yield CaptionedSpeechResponse(
                                     audio=base64_chunk,
@@ -271,7 +275,7 @@ async def single_output():
                             )
 
                             # Add any chunks that may be in the acumulator into the return word_timestamps
-                            if chunk_data.word_timestamps != None:
+                            if chunk_data.word_timestamps is not None:
                                 chunk_data.word_timestamps = (
                                     timestamp_acumulator + chunk_data.word_timestamps
                                 )
 
@@ -4,8 +4,10 @@
 Converts them into a format suitable for text-to-speech processing.
 """
 
+import math
 import re
 from functools import lru_cache
+from typing import List, Optional, Union
 
 import inflect
 from numpy import number
@@ -132,6 +134,7 @@
     "px": "pixel",  # CSS units
 }
 
+MONEY_UNITS = {"$": ("dollar", "cent"), "£": ("pound", "pence"), "€": ("euro", "cent")}
 
 # Pre-compiled regex patterns for performance
 EMAIL_PATTERN = re.compile(
@@ -152,35 +155,22 @@
 )
 
 TIME_PATTERN = re.compile(
-    r"([0-9]{2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
+    r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
 )
 
-INFLECT_ENGINE = inflect.engine()
+MONEY_PATTERN = re.compile(
+    r"(-?)(["
+    + "".join(MONEY_UNITS.keys())
+    + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
+    re.IGNORECASE,
+)
 
+NUMBER_PATTERN = re.compile(
+    r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b",
+    re.IGNORECASE,
+)
 
-def split_num(num: re.Match[str]) -> str:
-    """Handle number splitting for various formats"""
-    num = num.group()
-    if "." in num:
-        return num
-    elif ":" in num:
-        h, m = [int(n) for n in num.split(":")]
-        if m == 0:
-            return f"{h} o'clock"
-        elif m < 10:
-            return f"{h} oh {m}"
-        return f"{h} {m}"
-    year = int(num[:4])
-    if year < 1100 or year % 1000 < 10:
-        return num
-    left, right = num[:2], int(num[2:4])
-    s = "s" if num.endswith("s") else ""
-    if 100 <= year % 1000 <= 999:
-        if right == 0:
-            return f"{left} hundred{s}"
-        elif right < 10:
-            return f"{left} oh {right}{s}"
-    return f"{left} {right}{s}"
+INFLECT_ENGINE = inflect.engine()
 
 
 def handle_units(u: re.Match[str]) -> str:
@@ -208,14 +198,61 @@ def conditional_int(number: float, threshold: float = 0.00001):
     return number
 
 
+def translate_multiplier(multiplier: str) -> str:
+    """Translate multiplier abbreviations to words"""
+
+    multiplier_translation = {
+        "k": "thousand",
+        "m": "million",
+        "b": "billion",
+        "t": "trillion",
+    }
+    if multiplier.lower() in multiplier_translation:
+        return multiplier_translation[multiplier.lower()]
+    return multiplier.strip()
+
+
+def split_four_digit(number: float):
+    part1 = str(conditional_int(number))[:2]
+    part2 = str(conditional_int(number))[2:]
+    return f"{INFLECT_ENGINE.number_to_words(part1)} {INFLECT_ENGINE.number_to_words(part2)}"
+
+
+def handle_numbers(n: re.Match[str]) -> str:
+    number = n.group(2)
+
+    try:
+        number = float(number)
+    except:
+        return n.group()
+
+    if n.group(1) == "-":
+        number *= -1
+
+    multiplier = translate_multiplier(n.group(3))
+
+    number = conditional_int(number)
+    if multiplier != "":
+        multiplier = f" {multiplier}"
+    else:
+        if (
+            number % 1 == 0
+            and len(str(number)) == 4
+            and number > 1500
+            and number % 1000 > 9
+        ):
+            return split_four_digit(number)
+
+    return f"{INFLECT_ENGINE.number_to_words(number)}{multiplier}"
+
+
 def handle_money(m: re.Match[str]) -> str:
     """Convert money expressions to spoken form"""
 
-    bill = "dollar" if m.group(2) == "$" else "pound"
-    coin = "cent" if m.group(2) == "$" else "pence"
+    bill, coin = MONEY_UNITS[m.group(2)]
+
     number = m.group(3)
 
-    multiplier = m.group(4)
     try:
         number = float(number)
     except:
@@ -224,12 +261,17 @@ def handle_money(m: re.Match[str]) -> str:
     if m.group(1) == "-":
         number *= -1
 
+    multiplier = translate_multiplier(m.group(4))
+
+    if multiplier != "":
+        multiplier = f" {multiplier}"
+
     if number % 1 == 0 or multiplier != "":
         text_number = f"{INFLECT_ENGINE.number_to_words(conditional_int(number))}{multiplier} {INFLECT_ENGINE.plural(bill, count=number)}"
     else:
         sub_number = int(str(number).split(".")[-1].ljust(2, "0"))
 
-        text_number = f"{INFLECT_ENGINE.number_to_words(int(round(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
+        text_number = f"{INFLECT_ENGINE.number_to_words(int(math.floor(number)))} {INFLECT_ENGINE.plural(bill, count=number)} and {INFLECT_ENGINE.number_to_words(sub_number)} {INFLECT_ENGINE.plural(coin, count=sub_number)}"
 
     return text_number
 
@@ -320,15 +362,31 @@ def handle_phone_number(p: re.Match[str]) -> str:
 def handle_time(t: re.Match[str]) -> str:
     t = t.groups()
 
-    numbers = " ".join(
-        [INFLECT_ENGINE.number_to_words(X.strip()) for X in t[0].split(":")]
-    )
+    time_parts = t[0].split(":")
+
+    numbers = []
+    numbers.append(INFLECT_ENGINE.number_to_words(time_parts[0].strip()))
+
+    minute_number = INFLECT_ENGINE.number_to_words(time_parts[1].strip())
+    if int(time_parts[1]) < 10:
+        if int(time_parts[1]) != 0:
+            numbers.append(f"oh {minute_number}")
+    else:
+        numbers.append(minute_number)
 
     half = ""
-    if t[2] is not None:
-        half = t[2].strip()
+    if len(time_parts) > 2:
+        seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2].strip())
+        second_word = INFLECT_ENGINE.plural("second", int(time_parts[2].strip()))
+        numbers.append(f"and {seconds_number} {second_word}")
+    else:
+        if t[2] is not None:
+            half = " " + t[2].strip()
+        else:
+            if int(time_parts[1]) == 0:
+                numbers.append("o'clock")
 
-    return numbers + half
+    return " ".join(numbers) + half
 
 
 def normalize_text(text: str, normalization_options: NormalizationOptions) -> str:
@@ -366,7 +424,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
     for a, b in zip("、。！，：；？–", ",.!,:;?-"):
         text = text.replace(a, b + " ")
 
-    # Handle simple time in the format of HH:MM:SS
+    # Handle simple times such as H:MM, HH:MM or HH:MM:SS, optionally followed by am/pm
     text = TIME_PATTERN.sub(
         handle_time,
         text,
@@ -390,15 +448,12 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
     # Handle numbers and money
     text = re.sub(r"(?<=\d),(?=\d)", "", text)
 
-    text = re.sub(
-        r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b",
+    text = MONEY_PATTERN.sub(
         handle_money,
         text,
     )
 
-    text = re.sub(
-        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)", split_num, text
-    )
+    text = NUMBER_PATTERN.sub(handle_numbers, text)
 
     text = re.sub(r"\d*\.\d+", handle_decimal, text)