Reverted to pre-separator algorithm; added fix for conditional whitespace after "?", "!" and "." characters

AlejandroEsquivel · AlejandroEsquivel · commit 3fb576569633 · 2024-10-01T13:11:32.000-07:00
diff --git a/guardrails/utils/tokenization_utils.py b/guardrails/utils/tokenization_utils.py
@@ -8,15 +8,12 @@
 
 
 def replace_til_no_change(input_text, pattern, replacement):
-    while True:
-        new_text = re.sub(pattern, replacement, input_text)
-        if new_text == input_text:
-            break
-        input_text = new_text
+    while re.search(pattern, input_text):
+        input_text = re.sub(pattern, replacement, input_text)
     return input_text
 
 
-def postproc_splits(sentences, separator):
+def postproc_splits(sentences):
     """
     Applies heuristic rules to repair sentence splitting errors.
     Developed for use as postprocessing for the GENIA sentence
@@ -33,83 +30,63 @@ def postproc_splits(sentences, separator):
     Which draws in part on heuristics included in Yoshimasa Tsuruoka's
     medss.pl script.
     """
-
     # Remove Windows line endings
     sentences = sentences.replace("\r", "")
 
     # Breaks sometimes missing after "?", "safe" cases
-    sentences = re.sub(
-        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
-    )
-    # Breaks sometimes missing after ".", "safe" cases
-    sentences = re.sub(
-        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
-    )
+    sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
+    # Breaks sometimes missing after "." separated with extra space, "safe" cases
+    sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
 
     # No breaks producing lines only containing sentence-ending punctuation
-    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
+    sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
 
     # No breaks inside parentheses/brackets
+    # Unlimited length for no intervening parentheses/brackets
     sentences = replace_til_no_change(
-        sentences,
-        r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
-        r"[\1 \2]",
+        sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
     )
     sentences = replace_til_no_change(
-        sentences,
-        r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
-        r"(\1 \2)",
+        sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
     )
     # Standard mismatched with possible intervening
     sentences = replace_til_no_change(
-        sentences,
-        r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
-        r"[\1 \2]",
+        sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
     )
     sentences = replace_til_no_change(
-        sentences,
-        r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
-        r"(\1 \2)",
+        sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
     )
 
-    # Line breaks within quotes
+    # Guardrails mods for line breaks within quotes
     sentences = replace_til_no_change(
-        sentences,
-        r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
-        r'"\1 \2"',
+        sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
     )
     sentences = replace_til_no_change(
-        sentences,
-        r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
-        r"'\1 \2'",
+        sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
     )
 
     # Nesting to depth one
     sentences = replace_til_no_change(
         sentences,
-        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
-        + re.escape(separator)
-        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
+        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
         r"[\1 \2]",
     )
     sentences = replace_til_no_change(
         sentences,
-        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
-        + re.escape(separator)
-        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
+        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
         r"(\1 \2)",
     )
 
     # No break after periods followed by a non-uppercase "normal word"
-    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
+    sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
 
     # No break after a single letter other than I
-    sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
+    sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
 
     # No break before coordinating conjunctions (CC)
     coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
     for cc in coordinating_conjunctions:
-        sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
+        sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
 
     # No break before prepositions (IN)
     prepositions = [
@@ -138,18 +115,18 @@ def postproc_splits(sentences, separator):
         "whether",
     ]
     for prep in prepositions:
-        sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
+        sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
 
     # No sentence breaks in the middle of specific abbreviations
-    sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
-    sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
-    sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
+    sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
+    sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
+    sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
 
     # No sentence break after specific abbreviations
     abbreviations = [
-        r"e\.?g\.",
-        r"i\.?e\.",
-        r"i\.?v\.",
+        r"e\. ?g\.",
+        r"i\. ?e\.",
+        r"i\. ?v\.",
         r"vs\.",
         r"cf\.",
         r"Dr\.",
@@ -165,11 +142,12 @@ def postproc_splits(sentences, separator):
         r"Fig\.",
         r"vol\.",
         r"Vols\.",
-        r"No\.",
+        r"no\.",
         r"Nos\.",
         r"et\.",
         r"al\.",
-        r"Inc\.",
+        r"i\. ?v\.",
+        r"inc\.",
         r"Ltd\.",
         r"Co\.",
         r"Corp\.",
@@ -188,18 +166,16 @@ def postproc_splits(sentences, separator):
         r"sen\.",
         r"st\.",
         r"vs\.",
-        r"i\.?e\.",
+        r"i\. ?e\.",
     ]
     for abbr in abbreviations:
-        sentences = re.sub(
-            rf"(\b{abbr}){separator}", r"\1 ", sentences, flags=re.IGNORECASE
-        )
+        sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
 
     return sentences
 
 
-def split_sentences(text, separator="SENTENCEBREAK"):
-    # Use the separator in the regex
-    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
-    text = postproc_splits(text, separator)
-    return re.split(rf"\n?{separator} ?\n?", text)
+# Original split sentences function from rulebased_split_sentences
+def split_sentences(text):
+    text = re.sub(r"([?!.])(\s)?", r"\1\n", text)
+    text = postproc_splits(text)
+    return text.split("\n")