Commit bd0ab50 (1 parent: 1a2546a)

Testing changes using a custom separator in the word-tokenizer sentence-splitting algorithm

File tree: 2 files changed, +71 −41 lines


guardrails/utils/tokenization_utils.py

Lines changed: 61 additions & 37 deletions
@@ -8,12 +8,15 @@


 def replace_til_no_change(input_text, pattern, replacement):
-    while re.search(pattern, input_text):
-        input_text = re.sub(pattern, replacement, input_text)
+    while True:
+        new_text = re.sub(pattern, replacement, input_text)
+        if new_text == input_text:
+            break
+        input_text = new_text
     return input_text


-def postproc_splits(sentences):
+def postproc_splits(sentences, separator):
     """
     Applies heuristic rules to repair sentence splitting errors.
     Developed for use as postprocessing for the GENIA sentence
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
     Which draws in part on heuristics included in Yoshimasa Tsuruoka's
     medss.pl script.
     """
+
     # Remove Windows line endings
     sentences = sentences.replace("\r", "")

     # Breaks sometimes missing after "?", "safe" cases
-    sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
-    # Breaks sometimes missing after "." separated with extra space, "safe" cases
-    sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
+    sentences = re.sub(
+        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )
+    # Breaks sometimes missing after ".", "safe" cases
+    sentences = re.sub(
+        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )

     # No breaks producing lines only containing sentence-ending punctuation
-    sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
+    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)

     # No breaks inside parentheses/brackets
-    # Unlimited length for no intervening parentheses/brackets
     sentences = replace_til_no_change(
-        sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
+        sentences,
+        r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
+        r"[\1 \2]",
     )
     sentences = replace_til_no_change(
-        sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
+        sentences,
+        r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
+        r"(\1 \2)",
     )
     # Standard mismatched with possible intervening
     sentences = replace_til_no_change(
-        sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
+        sentences,
+        r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
+        r"[\1 \2]",
     )
     sentences = replace_til_no_change(
-        sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
+        sentences,
+        r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
+        r"(\1 \2)",
     )

-    # Guardrails mods for line breaks within quotes
+    # Line breaks within quotes
     sentences = replace_til_no_change(
-        sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
+        sentences,
+        r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
+        r'"\1 \2"',
     )
     sentences = replace_til_no_change(
-        sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
+        sentences,
+        r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
+        r"'\1 \2'",
     )

     # Nesting to depth one
     sentences = replace_til_no_change(
         sentences,
-        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
+        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
         r"[\1 \2]",
     )
     sentences = replace_til_no_change(
         sentences,
-        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
+        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
         r"(\1 \2)",
     )

     # No break after periods followed by a non-uppercase "normal word"
-    sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
+    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

     # No break after a single letter other than I
-    sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
+    sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)

     # No break before coordinating conjunctions (CC)
     coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
     for cc in coordinating_conjunctions:
-        sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
+        sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)

     # No break before prepositions (IN)
     prepositions = [
@@ -115,18 +138,18 @@ def postproc_splits(sentences):
         "whether",
     ]
     for prep in prepositions:
-        sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
+        sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)

     # No sentence breaks in the middle of specific abbreviations
-    sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
-    sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
-    sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)

     # No sentence break after specific abbreviations
     abbreviations = [
-        r"e\. ?g\.",
-        r"i\. ?e\.",
-        r"i\. ?v\.",
+        r"e\.?g\.",
+        r"i\.?e\.",
+        r"i\.?v\.",
         r"vs\.",
         r"cf\.",
         r"Dr\.",
@@ -142,12 +165,11 @@ def postproc_splits(sentences):
         r"Fig\.",
         r"vol\.",
         r"Vols\.",
-        r"no\.",
+        r"No\.",
         r"Nos\.",
         r"et\.",
         r"al\.",
-        r"i\. ?v\.",
-        r"inc\.",
+        r"Inc\.",
         r"Ltd\.",
         r"Co\.",
         r"Corp\.",
@@ -166,16 +188,18 @@ def postproc_splits(sentences):
         r"sen\.",
         r"st\.",
         r"vs\.",
-        r"i\. ?e\.",
+        r"i\.?e\.",
     ]
     for abbr in abbreviations:
-        sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
+        sentences = re.sub(
+            rf"(\b{abbr}){separator}", r"\1 ", sentences, flags=re.IGNORECASE
+        )

     return sentences


-# Original split sentences function from rulebased_split_sentences
-def split_sentences(text):
-    text = re.sub(r"([?!.])\s", r"\1\n", text)
-    text = postproc_splits(text)
-    return text.split("\n")
+def split_sentences(text, separator="SENTENCEBREAK"):
+    # Tag sentence-ending punctuation with the separator, then split on it
+    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
+    text = postproc_splits(text, separator)
+    return re.split(rf"\n?{separator} ?\n?", text)
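
The rewritten split_sentences tags each sentence-ending punctuation mark with the separator (via a lookahead, so the following whitespace is preserved), lets postproc_splits erase markers that fall inside quotes, brackets, or abbreviations, and finally splits on the marker. A hand-traced sketch of the intended behavior, using the module path from this commit; the expected outputs are inferred from the regexes above, not from a recorded test run:

    from guardrails.utils.tokenization_utils import split_sentences

    print(split_sentences("It rained. We stayed inside. The storm passed."))
    # -> ['It rained.', 'We stayed inside.', 'The storm passed.', '']
    # The trailing empty string comes from the marker tagged after the
    # final period; callers may want to drop empty fragments.

    # The abbreviation heuristics suppress a false break after "Fig.":
    print(split_sentences("See Fig. 2 for details. It is striking."))
    # -> ['See Fig.  2 for details.', 'It is striking.', '']
    # (Doubled space: the lookahead no longer consumes the whitespace after
    # the punctuation, so the join re-inserts a second space.)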

guardrails/validator_base.py

Lines changed: 10 additions & 4 deletions
@@ -44,7 +44,9 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]


-def split_sentence_word_tokenizers_jl(chunk: str):
+def split_sentence_word_tokenizers_jl(
+    chunk: str, separator: str = "SENTENCEBREAK"
+) -> List[str]:
     """
     Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
     We return the first sentence and the remaining chunks without the first sentence.
@@ -69,12 +71,16 @@ def split_sentence_word_tokenizers_jl(chunk: str):
     is_minimum_length = True

     # check for potential line endings, which is what split_sentences does
-    chunk_with_potential_line_endings, count = re.subn(r"([?!.])\s", r"\1\n", chunk)
-    any_potential_line_endings = count > 0
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
+    )
+    any_potential_line_endings = count > 0
     if not is_minimum_length or not any_potential_line_endings:
         return []

-    sentences = postproc_splits(chunk_with_potential_line_endings).split("\n")
+    sentences = postproc_splits(chunk_with_potential_line_endings, separator).split(
+        separator
+    )
     # if not more than one sentence, we haven't accumulated enough for a validation
     if len(sentences) <= 1:
         return []
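
Because the detection step unpacks two return values, it relies on re.subn, which returns a (new_string, substitution_count) tuple; plain re.sub returns only the string and would fail to unpack. A standalone sketch of that step with a hypothetical chunk:

    import re

    separator = "SENTENCEBREAK"  # default marker used above

    chunk = "This part is done. This part is not"
    tagged, count = re.subn(r"([?!.])(?=\s|$)", rf"\1{separator}", chunk)
    print(tagged)  # This part is done.SENTENCEBREAK This part is not
    print(count)   # 1 -> at least one potential sentence boundary exists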
