Skip to content

Commit affb205

Browse files
removed older scripts
1 parent 49aa02d commit affb205

File tree

3 files changed

+58
-303
lines changed

3 files changed

+58
-303
lines changed

guardrails/utils/tokenization_utils.py

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@
88

99

1010
def replace_til_no_change(input_text, pattern, replacement):
    """Repeatedly apply ``re.sub(pattern, replacement)`` until a fixed point.

    Each pass substitutes every match; the loop stops as soon as a pass
    leaves the text unchanged, and that stable text is returned.
    """
    previous = None
    while input_text != previous:
        previous = input_text
        input_text = re.sub(pattern, replacement, input_text)
    return input_text
1417

1518

16-
def postproc_splits(sentences):
19+
def postproc_splits(sentences, separator):
1720
"""
1821
Applies heuristic rules to repair sentence splitting errors.
1922
Developed for use as postprocessing for the GENIA sentence
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
3033
Which draws in part on heuristics included in Yoshimasa Tsuruoka's
3134
medss.pl script.
3235
"""
36+
3337
# Remove Windows line endings
3438
sentences = sentences.replace("\r", "")
3539

3640
# Breaks sometimes missing after "?", "safe" cases
37-
sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
38-
# Breaks sometimes missing after "." separated with extra space, "safe" cases
39-
sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
41+
sentences = re.sub(
42+
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
43+
)
44+
# Breaks sometimes missing after ".", "safe" cases
45+
sentences = re.sub(
46+
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
47+
)
4048

4149
# No breaks producing lines only containing sentence-ending punctuation
42-
sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
50+
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
4351

4452
# No breaks inside parentheses/brackets
45-
# Unlimited length for no intervening parentheses/brackets
4653
sentences = replace_til_no_change(
47-
sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
54+
sentences,
55+
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
56+
r"[\1 \2]",
4857
)
4958
sentences = replace_til_no_change(
50-
sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
59+
sentences,
60+
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
61+
r"(\1 \2)",
5162
)
5263
# Standard mismatched with possible intervening
5364
sentences = replace_til_no_change(
54-
sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
65+
sentences,
66+
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
67+
r"[\1 \2]",
5568
)
5669
sentences = replace_til_no_change(
57-
sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
70+
sentences,
71+
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
72+
r"(\1 \2)",
5873
)
5974

60-
# Guardrails mods for line breaks within quotes
75+
# Line breaks within quotes
6176
sentences = replace_til_no_change(
62-
sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
77+
sentences,
78+
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
79+
r'"\1 \2"',
6380
)
6481
sentences = replace_til_no_change(
65-
sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
82+
sentences,
83+
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
84+
r"'\1 \2'",
6685
)
6786

6887
# Nesting to depth one
6988
sentences = replace_til_no_change(
7089
sentences,
71-
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
90+
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
91+
+ re.escape(separator)
92+
+ r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
7293
r"[\1 \2]",
7394
)
7495
sentences = replace_til_no_change(
7596
sentences,
76-
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
97+
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
98+
+ re.escape(separator)
99+
+ r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
77100
r"(\1 \2)",
78101
)
79102

80103
# No break after periods followed by a non-uppercase "normal word"
81-
sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
104+
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
82105

83106
# No break after a single letter other than I
84-
sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
107+
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
85108

86109
# No break before coordinating conjunctions (CC)
87110
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
88111
for cc in coordinating_conjunctions:
89-
sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
112+
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
90113

91114
# No break before prepositions (IN)
92115
prepositions = [
@@ -115,12 +138,12 @@ def postproc_splits(sentences):
115138
"whether",
116139
]
117140
for prep in prepositions:
118-
sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
141+
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
119142

120143
# No sentence breaks in the middle of specific abbreviations
121-
sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
122-
sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
123-
sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
144+
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
145+
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
146+
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
124147

125148
# No sentence break after specific abbreviations
126149
abbreviations = [
@@ -169,13 +192,15 @@ def postproc_splits(sentences):
169192
r"i\. ?e\.",
170193
]
171194
for abbr in abbreviations:
172-
sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
195+
sentences = re.sub(
196+
rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
197+
)
173198

174199
return sentences
175200

176201

177-
def split_sentences(text, separator="abcdsentenceseperatordcba"):
    """Split ``text`` into sentences using rule-based heuristics.

    A sentinel ``separator`` is inserted after sentence-ending punctuation
    (``?``, ``!``, ``.``) that is followed by whitespace or end-of-string,
    heuristic repairs are applied by ``postproc_splits``, and the text is
    then split on the sentinel.

    Args:
        text: The input text to split into sentences.
        separator: Sentinel marker assumed not to occur naturally in
            ``text``. Defaults to an unlikely letter sequence.

    Returns:
        A list of sentence strings.
    """
    # Use a callable replacement so a caller-supplied separator containing
    # backslashes or group references (e.g. r"\1") is inserted literally.
    text = re.sub(r"([?!.])(?=\s|$)", lambda m: m.group(1) + separator, text)
    text = postproc_splits(text, separator)
    # Escape the sentinel so regex metacharacters in a custom separator are
    # treated literally — consistent with postproc_splits, which already
    # wraps the separator in re.escape() internally.
    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)

guardrails/utils/tokenization_utils_seperator.py

Lines changed: 0 additions & 206 deletions
This file was deleted.

0 commit comments

Comments (0)