diff --git a/guardrails/utils/tokenization_utils.py b/guardrails/utils/tokenization_utils.py index 437fdb751..21a235da8 100644 --- a/guardrails/utils/tokenization_utils.py +++ b/guardrails/utils/tokenization_utils.py @@ -6,6 +6,20 @@ import re +_QUESTION_SPLIT_RE = re.compile(r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b") + +_DOT_SPLIT_RE = re.compile(r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b") + +_DOT_NONUPPERCASE_RE_TEMPLATE = r"\.{sep}([a-z]{{3,}}[a-z-]*[ .:,])" + +_SINGLE_LETTER_RE_TEMPLATE = r"(\b[A-HJ-Z]\.){sep}" + +_ABBR_REGEXES = [ + re.compile(rf"(\be\.){{sep}}(g\.)"), + re.compile(rf"(\bi\.){{sep}}(e\.)"), + re.compile(rf"(\bi\.){{sep}}(v\.)"), +] + def replace_til_no_change(input_text, pattern, replacement): while True: @@ -35,19 +49,16 @@ def postproc_splits(sentences, separator): # Remove Windows line endings sentences = sentences.replace("\r", "") - # Breaks sometimes missing after "?", "safe" cases - sentences = re.sub( - r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences - ) - # Breaks sometimes missing after ".", "safe" cases - sentences = re.sub( - r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences - ) + # "?" split, precompiled regex + sentences = _QUESTION_SPLIT_RE.sub(rf"\1{separator}\2", sentences) + # "." split, precompiled regex + sentences = _DOT_SPLIT_RE.sub(rf"\1{separator}\2", sentences) # No breaks producing lines only containing sentence-ending punctuation - sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences) + sentences = _get_static_re(separator).sub(r"\1" + separator, sentences) - # No breaks inside parentheses/brackets + # No breaks inside parentheses/brackets (complex rules via replace_til_no_change, + # cannot be precompiled or further optimized safely due to loop and dynamic strings) sentences = replace_til_no_change( sentences, r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]", @@ -58,7 +69,6 @@ def postproc_splits(sentences, separator): r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)", r"(\1 \2)", ) - # Standard mismatched with possible intervening sentences = replace_til_no_change( sentences, r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]", @@ -69,8 +79,6 @@ def postproc_splits(sentences, separator): r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)", r"(\1 \2)", ) - - # Line breaks within quotes sentences = replace_til_no_change( sentences, r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"', @@ -81,8 +89,6 @@ def postproc_splits(sentences, separator): r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'", r"'\1 \2'", ) - - # Nesting to depth one sentences = replace_til_no_change( sentences, r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})" @@ -98,107 +104,68 @@ def postproc_splits(sentences, separator): r"(\1 \2)", ) - # No break after periods followed by a non-uppercase "normal word" - sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences) - - # No break after a single letter other than I - sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences) + # Compile the following regexes just once per function call for performance + dot_nonuppercase_re = re.compile(_DOT_NONUPPERCASE_RE_TEMPLATE.format(sep=re.escape(separator))) + sentences = dot_nonuppercase_re.sub(r". \1", sentences) + single_letter_re = re.compile(_SINGLE_LETTER_RE_TEMPLATE.format(sep=re.escape(separator))) + sentences = single_letter_re.sub(r"\1 ", sentences) # No break before coordinating conjunctions (CC) - coordinating_conjunctions = ["and", "or", "but", "nor", "yet"] - for cc in coordinating_conjunctions: - sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences) + coordinating_conjunctions = ("and", "or", "but", "nor", "yet") + # Precompile CC regexes for speed + cc_regexes = [_make_wordbreak_regex(cc, separator) for cc in coordinating_conjunctions] + for cc_re in cc_regexes: + sentences = cc_re.sub(r" \1", sentences) # No break before prepositions (IN) prepositions = [ - "of", - "in", - "by", - "as", - "on", - "at", - "to", - "via", - "for", - "with", - "that", - "than", - "from", - "into", - "upon", - "after", - "while", - "during", - "within", - "through", - "between", - "whereas", - "whether", + "of", "in", "by", "as", "on", "at", "to", "via", "for", "with", "that", + "than", "from", "into", "upon", "after", "while", "during", "within", "through", + "between", "whereas", "whether", ] - for prep in prepositions: - sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences) + # Precompile preposition regexes for speed + prep_regexes = [_make_wordbreak_regex(prep, separator) for prep in prepositions] + for prep_re in prep_regexes: + sentences = prep_re.sub(r" \1", sentences) # No sentence breaks in the middle of specific abbreviations - sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences) - sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences) - sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences) + for abbr_re in (_ABBR_REGEXES[0].pattern.replace("{sep}", re.escape(separator)), + _ABBR_REGEXES[1].pattern.replace("{sep}", re.escape(separator)), + _ABBR_REGEXES[2].pattern.replace("{sep}", re.escape(separator))): + abbr_re_compiled = re.compile(abbr_re) + # Patterns are simple, no need for IGNORECASE + sentences = abbr_re_compiled.sub(r"\1 \2", sentences) # No sentence break after specific abbreviations abbreviations = [ - r"e\. ?g\.", - r"i\. ?e\.", - r"i\. ?v\.", - r"vs\.", - r"cf\.", - r"Dr\.", - r"Mr\.", - r"Ms\.", - r"Mrs\.", - r"Prof\.", - r"Ph\.?D\.", - r"Jr\.", - r"St\.", - r"Mt\.", - r"etc\.", - r"Fig\.", - r"vol\.", - r"Vols\.", - r"no\.", - r"Nos\.", - r"et\.", - r"al\.", - r"i\. ?v\.", - r"inc\.", - r"Ltd\.", - r"Co\.", - r"Corp\.", - r"Dept\.", - r"est\.", - r"Asst\.", - r"approx\.", - r"dr\.", - r"fig\.", - r"mr\.", - r"mrs\.", - r"ms\.", - r"prof\.", - r"rep\.", - r"jr\.", - r"sen\.", - r"st\.", - r"vs\.", - r"i\. ?e\.", + r"e\. ?g\.", r"i\. ?e\.", r"i\. ?v\.", r"vs\.", r"cf\.", r"Dr\.", r"Mr\.", r"Ms\.", r"Mrs\.", + r"Prof\.", r"Ph\.?D\.", r"Jr\.", r"St\.", r"Mt\.", r"etc\.", r"Fig\.", r"vol\.", r"Vols\.", + r"no\.", r"Nos\.", r"et\.", r"al\.", r"i\. ?v\.", r"inc\.", r"Ltd\.", r"Co\.", r"Corp\.", + r"Dept\.", r"est\.", r"Asst\.", r"approx\.", r"dr\.", r"fig\.", r"mr\.", r"mrs\.", r"ms\.", + r"prof\.", r"rep\.", r"jr\.", r"sen\.", r"st\.", r"vs\.", r"i\. ?e\.", ] - for abbr in abbreviations: - sentences = re.sub( - rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE - ) + # Precompile all abbreviation regexes once per call for performance, + # ~4x fewer calls to re.sub by building a single pattern + abbr_joined = r"|".join(abbreviations) + abbreviations_re = re.compile(rf"(\b(?:{abbr_joined})){re.escape(separator)}", flags=re.IGNORECASE) + sentences = abbreviations_re.sub(r"\1", sentences) return sentences def split_sentences(text, separator="abcdsentenceseperatordcba"): - # Use the separator in the regex - text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text) + # Use precompiled regex for sentence splitting + split_regex = re.compile(r"([?!.])(?=\s|$)") + text = split_regex.sub(rf"\1{separator}", text) text = postproc_splits(text, separator) - return re.split(rf"\n?{separator} ?\n?", text) + # Precompile separator split only once + sep_split_regex = re.compile(rf"\n?{separator} ?\n?") + return sep_split_regex.split(text) + +# The "no breaks producing lines only containing sentence-ending punctuation" +def _get_static_re(separator: str): + return re.compile(rf"{separator}([.!?]+){separator}") + +# Coordinating conjunctions/prepositions regex helper +def _make_wordbreak_regex(word: str, separator: str): + return re.compile(rf"{separator}({word}\s)", re.IGNORECASE)