44Converts them into a format suitable for text-to-speech processing.
55"""
66
7+ import math
78import re
89from functools import lru_cache
10+ from typing import List , Optional , Union
911
1012import inflect
1113from numpy import number
132134 "px" : "pixel" , # CSS units
133135}
134136
# Currency symbol -> (major unit, minor unit), both in singular form.
# The symbols listed here are also the ones MONEY_PATTERN recognises.
MONEY_UNITS = {
    "$": ("dollar", "cent"),
    "£": ("pound", "pence"),
    "€": ("euro", "cent"),
}
135138
136139# Pre-compiled regex patterns for performance
137140EMAIL_PATTERN = re .compile (
152155)
153156
# Clock times such as "9:30", "09:30:15" or "9:30 pm".
#   group 1: the H:MM or H:MM:SS text (optional spaces around the colons)
#   group 2: the optional ":SS" part
#   group 3: the optional am/pm marker including its leading space
#   group 4: the bare "am"/"pm"
# NOTE: the hour quantifier must be written `{1,2}` with no space; `{1, 2}`
# is not a repetition in Python's `re` and would be matched literally.
TIME_PATTERN = re.compile(
    r"([0-9]{1,2} ?: ?[0-9]{2}( ?: ?[0-9]{2})?)( ?(pm|am)\b)?", re.IGNORECASE
)
157160
158- INFLECT_ENGINE = inflect .engine ()
# Currency amounts such as "$5", "-£3.50", "€2 million" or "$10k".
#   group 1: optional minus sign
#   group 2: currency symbol (one of the MONEY_UNITS keys)
#   group 3: the numeric amount
#   group 4: zero or more multipliers (spelled out with a leading space, or
#            one of the one-letter abbreviations k/m/b/t)
# Symbols are escaped so that a future key that is special inside a
# character class (e.g. "^" or "]") cannot corrupt the pattern.
MONEY_PATTERN = re.compile(
    r"(-?)(["
    + "".join(re.escape(symbol) for symbol in MONEY_UNITS)
    + r"])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b|t)*)\b",
    re.IGNORECASE,
)
159167
# Plain numbers such as "42", "-2.5", "3 thousand" or "5k".
#   group 1: optional minus sign
#   group 2: the numeric value
#   group 3: zero or more multipliers (spelled out with a leading space, or
#            one of the one-letter abbreviations k/m/b)
NUMBER_PATTERN = re.compile(
    r"(-?)(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion|k|m|b)*)\b",
    re.IGNORECASE,
)
160172
161- def split_num (num : re .Match [str ]) -> str :
162- """Handle number splitting for various formats"""
163- num = num .group ()
164- if "." in num :
165- return num
166- elif ":" in num :
167- h , m = [int (n ) for n in num .split (":" )]
168- if m == 0 :
169- return f"{ h } o'clock"
170- elif m < 10 :
171- return f"{ h } oh { m } "
172- return f"{ h } { m } "
173- year = int (num [:4 ])
174- if year < 1100 or year % 1000 < 10 :
175- return num
176- left , right = num [:2 ], int (num [2 :4 ])
177- s = "s" if num .endswith ("s" ) else ""
178- if 100 <= year % 1000 <= 999 :
179- if right == 0 :
180- return f"{ left } hundred{ s } "
181- elif right < 10 :
182- return f"{ left } oh { right } { s } "
183- return f"{ left } { right } { s } "
173+ INFLECT_ENGINE = inflect .engine ()
184174
185175
186176def handle_units (u : re .Match [str ]) -> str :
@@ -208,14 +198,61 @@ def conditional_int(number: float, threshold: float = 0.00001):
208198 return number
209199
210200
def translate_multiplier(multiplier: Optional[str]) -> str:
    """Translate a multiplier abbreviation into its spoken word.

    Args:
        multiplier: Multiplier text captured after a number, for example
            ``"k"``, ``"M"`` or ``" thousand"``. ``None`` and the empty
            string both mean "no multiplier".

    Returns:
        The full word for a known one-letter abbreviation (``k``, ``m``,
        ``b``, ``t``; matched case-insensitively). Any other text is returned
        with surrounding whitespace removed, and ``""`` is returned when
        there is no multiplier.
    """
    if not multiplier:
        return ""

    multiplier_translation = {
        "k": "thousand",
        "m": "million",
        "b": "billion",
        "t": "trillion",
    }
    # Strip before the lookup so padded abbreviations are still recognised.
    cleaned = multiplier.strip()
    return multiplier_translation.get(cleaned.lower(), cleaned)
213+
214+
def split_four_digit(number: float) -> str:
    """Read a four-digit number as two two-digit halves, the way years are spoken.

    Examples of the intended output: 1984 -> "nineteen eighty-four",
    1905 -> "nineteen oh five", 1900 -> "nineteen hundred".

    Args:
        number: The value to read out; expected to be a whole four-digit
            number (the caller checks this before calling).

    Returns:
        The spoken form. Input that does not render as exactly four digits
        is passed to ``number_to_words`` unchanged instead of being split.
    """
    # NOTE(review): relies on conditional_int returning an int for whole
    # values so that str() yields plain digits - confirm against its definition.
    digits = str(conditional_int(number))
    if len(digits) != 4 or not digits.isdigit():
        return INFLECT_ENGINE.number_to_words(digits)

    leading = INFLECT_ENGINE.number_to_words(digits[:2])
    trailing = int(digits[2:])

    # A "00" or "0X" second half needs explicit wording: number_to_words
    # drops leading zeros, which would turn 1905 into "nineteen five".
    if trailing == 0:
        return f"{leading} hundred"
    if trailing < 10:
        return f"{leading} oh {INFLECT_ENGINE.number_to_words(trailing)}"
    return f"{leading} {INFLECT_ENGINE.number_to_words(trailing)}"
219+
220+
def handle_numbers(n: re.Match[str]) -> str:
    """Convert a NUMBER_PATTERN match into its spoken form.

    Args:
        n: Match whose group 1 is an optional minus sign, group 2 the numeric
            text and group 3 the (possibly empty) multiplier text.

    Returns:
        The number in words, followed by the multiplier word when one was
        captured. Whole four-digit values above 1500 with no multiplier are
        read in two halves via ``split_four_digit`` unless the last three
        digits are below 10 (e.g. 2005). The matched text is returned
        unchanged if the numeric group cannot be parsed.
    """
    # A local name other than `number` avoids shadowing the module-level
    # `from numpy import number` import.
    try:
        value = float(n.group(2))
    except ValueError:
        return n.group()

    if n.group(1) == "-":
        value *= -1

    multiplier = translate_multiplier(n.group(3))
    value = conditional_int(value)

    if multiplier != "":
        return f"{INFLECT_ENGINE.number_to_words(value)} {multiplier}"

    if (
        value % 1 == 0
        and len(str(value)) == 4
        and value > 1500
        and value % 1000 > 9
    ):
        return split_four_digit(value)

    return f"{INFLECT_ENGINE.number_to_words(value)}"
247+
248+
211249def handle_money (m : re .Match [str ]) -> str :
212250 """Convert money expressions to spoken form"""
213251
214- bill = "dollar" if m .group (2 ) == "$" else "pound"
215- coin = "cent" if m . group ( 2 ) == "$" else "pence"
252+ bill , coin = MONEY_UNITS [ m .group (2 )]
253+
216254 number = m .group (3 )
217255
218- multiplier = m .group (4 )
219256 try :
220257 number = float (number )
221258 except :
@@ -224,12 +261,17 @@ def handle_money(m: re.Match[str]) -> str:
224261 if m .group (1 ) == "-" :
225262 number *= - 1
226263
264+ multiplier = translate_multiplier (m .group (4 ))
265+
266+ if multiplier != "" :
267+ multiplier = f" { multiplier } "
268+
227269 if number % 1 == 0 or multiplier != "" :
228270 text_number = f"{ INFLECT_ENGINE .number_to_words (conditional_int (number ))} { multiplier } { INFLECT_ENGINE .plural (bill , count = number )} "
229271 else :
230272 sub_number = int (str (number ).split ("." )[- 1 ].ljust (2 , "0" ))
231273
232- text_number = f"{ INFLECT_ENGINE .number_to_words (int (round (number )))} { INFLECT_ENGINE .plural (bill , count = number )} and { INFLECT_ENGINE .number_to_words (sub_number )} { INFLECT_ENGINE .plural (coin , count = sub_number )} "
274+ text_number = f"{ INFLECT_ENGINE .number_to_words (int (math . floor (number )))} { INFLECT_ENGINE .plural (bill , count = number )} and { INFLECT_ENGINE .number_to_words (sub_number )} { INFLECT_ENGINE .plural (coin , count = sub_number )} "
233275
234276 return text_number
235277
@@ -320,15 +362,31 @@ def handle_phone_number(p: re.Match[str]) -> str:
def handle_time(t: re.Match[str]) -> str:
    """Convert a TIME_PATTERN match into its spoken form.

    Args:
        t: Match whose first group is the ``H:MM`` or ``H:MM:SS`` text and
            whose third group is the optional am/pm marker.

    Returns:
        The hour in words, then the minutes ("oh five" for 01-09, nothing
        for 00), then "and N second(s)" when seconds are present, then the
        am/pm marker when one was captured. "o'clock" is added only for a
        whole hour that has neither seconds nor an am/pm marker.
    """
    groups = t.groups()

    time_parts = [part.strip() for part in groups[0].split(":")]
    minute = int(time_parts[1])

    words = [INFLECT_ENGINE.number_to_words(time_parts[0])]
    if minute >= 10:
        words.append(INFLECT_ENGINE.number_to_words(time_parts[1]))
    elif minute != 0:
        words.append(f"oh {INFLECT_ENGINE.number_to_words(time_parts[1])}")

    meridiem = groups[2].strip() if groups[2] is not None else ""

    if len(time_parts) > 2:
        seconds_number = INFLECT_ENGINE.number_to_words(time_parts[2])
        second_word = INFLECT_ENGINE.plural("second", int(time_parts[2]))
        words.append(f"and {seconds_number} {second_word}")
    elif not meridiem and minute == 0:
        words.append("o'clock")

    # Keep the am/pm marker in every case; it used to be discarded whenever
    # the time included seconds.
    if meridiem:
        words.append(meridiem)

    return " ".join(words)
332390
333391
334392def normalize_text (text : str , normalization_options : NormalizationOptions ) -> str :
@@ -366,7 +424,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
366424 for a , b in zip ("、。!,:;?–" , ",.!,:;?-" ):
367425 text = text .replace (a , b + " " )
368426
369- # Handle simple time in the format of HH:MM:SS
427+ # Handle simple time in the format of HH:MM:SS (am/pm)
370428 text = TIME_PATTERN .sub (
371429 handle_time ,
372430 text ,
@@ -390,15 +448,12 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st
390448 # Handle numbers and money
391449 text = re .sub (r"(?<=\d),(?=\d)" , "" , text )
392450
393- text = re .sub (
394- r"(?i)(-?)([$£])(\d+(?:\.\d+)?)((?: hundred| thousand| (?:[bm]|tr|quadr)illion)*)\b" ,
451+ text = MONEY_PATTERN .sub (
395452 handle_money ,
396453 text ,
397454 )
398455
399- text = re .sub (
400- r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)" , split_num , text
401- )
456+ text = NUMBER_PATTERN .sub (handle_numbers , text )
402457
403458 text = re .sub (r"\d*\.\d+" , handle_decimal , text )
404459
0 commit comments