Added back modified separator algorithm; fix for split sentence

AlejandroEsquivel · AlejandroEsquivel · commit 6462558c0921 · 2024-10-01T13:52:14.000-07:00
diff --git a/guardrails/utils/tokenization_utils_seperator.py b/guardrails/utils/tokenization_utils_seperator.py
@@ -0,0 +1,205 @@
+# This file contains code adapted from the WordTokenizers.jl
+# https://github.com/JuliaText/WordTokenizers.jl project.
+# It is subject to the license terms in the Apache License file
+# found in the top-level directory of this distribution.
+# This file has been modified by Guardrails AI on September 27 2024.
+
+import re
+
+
+def replace_til_no_change(input_text, pattern, replacement):
+    """Apply `re.sub` repeatedly and return the text once it stops changing."""
+    current = input_text
+    updated = re.sub(pattern, replacement, current)
+    while updated != current:
+        current, updated = updated, re.sub(pattern, replacement, updated)
+    return current
+
+
+def postproc_splits(sentences, separator):
+    """
+    Applies heuristic rules to repair sentence splitting errors.
+    Developed for use as postprocessing for the GENIA sentence splitter on
+    PubMed abstracts, with minor tweaks for full-text documents.
+
+    Args:
+        sentences (str): Text with `separator` at each candidate sentence boundary.
+        separator (str): Boundary marker; always matched and inserted literally.
+
+    Returns:
+        str: The text after the heuristics have removed or added separators.
+
+    Based on
+    https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
+    Which is
+    (c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
+    Which draws in part on heuristics included in Yoshimasa Tsuruoka's
+    medss.pl script.
+    """
+    # Escape once (pattern and replacement forms) so the separator stays literal
+    sep = re.escape(separator)
+    sep_repl = separator.replace("\\", "\\\\")
+    insert_sep = rf"\g<1>{sep_repl}\g<2>"
+
+    # Remove Windows line endings
+    sentences = sentences.replace("\r", "")
+
+    # Breaks sometimes missing after "?", "safe" cases
+    sentences = re.sub(r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", insert_sep, sentences)
+    # Breaks sometimes missing after ".", "safe" cases
+    sentences = re.sub(r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", insert_sep, sentences)
+
+    # No breaks producing lines only containing sentence-ending punctuation
+    sentences = re.sub(rf"{sep}([.!?]+){sep}", r"\g<1>" + sep_repl, sentences)
+
+    # No breaks inside parentheses/brackets
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[([^\[\]\(\)]*)" + sep + r"([^\[\]\(\)]*)\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(([^\[\]\(\)]*)" + sep + r"([^\[\]\(\)]*)\)",
+        r"(\1 \2)",
+    )
+    # Standard mismatched with possible intervening
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[([^\[\]]{0,250})" + sep + r"([^\[\]]{0,250})\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(([^\(\)]{0,250})" + sep + r"([^\(\)]{0,250})\)",
+        r"(\1 \2)",
+    )
+
+    # Line breaks within quotes
+    sentences = replace_til_no_change(
+        sentences,
+        r'"([^"\n]{0,250})' + sep + r'([^"\n]{0,250})"',
+        r'"\1 \2"',
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"'([^'\n]{0,250})" + sep + r"([^'\n]{0,250})'",
+        r"'\1 \2'",
+    )
+
+    # Nesting to depth one
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+        + sep
+        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+        + sep
+        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
+        r"(\1 \2)",
+    )
+
+    # No break after periods followed by a non-uppercase "normal word"
+    sentences = re.sub(rf"\.{sep}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
+
+    # No break after a single letter other than I
+    sentences = re.sub(rf"(\b[A-HJ-Z]\.){sep}", r"\1 ", sentences)
+
+    # No break before coordinating conjunctions (CC)
+    for cc in ("and", "or", "but", "nor", "yet"):
+        sentences = re.sub(rf"{sep}({cc}\s)", r" \1", sentences)
+
+    # No break before prepositions (IN)
+    prepositions = [
+        "of",
+        "in",
+        "by",
+        "as",
+        "on",
+        "at",
+        "to",
+        "via",
+        "for",
+        "with",
+        "that",
+        "than",
+        "from",
+        "into",
+        "upon",
+        "after",
+        "while",
+        "during",
+        "within",
+        "through",
+        "between",
+        "whereas",
+        "whether",
+    ]
+    for prep in prepositions:
+        sentences = re.sub(rf"{sep}({prep}\s)", r" \1", sentences)
+
+    # No sentence breaks in the middle of specific abbreviations
+    sentences = re.sub(rf"(\be\.){sep}(g\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){sep}(e\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){sep}(v\.)", r"\1 \2", sentences)
+
+    # No sentence break after specific abbreviations
+    abbreviations = [
+        r"e\.?g\.",
+        r"i\.?e\.",
+        r"i\.?v\.",
+        r"vs\.",
+        r"cf\.",
+        r"Dr\.",
+        r"Mr\.",
+        r"Ms\.",
+        r"Mrs\.",
+        r"Prof\.",
+        r"Ph\.?D\.",
+        r"Jr\.",
+        r"St\.",
+        r"Mt\.",
+        r"etc\.",
+        r"Fig\.",
+        r"vol\.",
+        r"Vols\.",
+        r"No\.",
+        r"Nos\.",
+        r"et\.",
+        r"al\.",
+        r"Inc\.",
+        r"Ltd\.",
+        r"Co\.",
+        r"Corp\.",
+        r"Dept\.",
+        r"est\.",
+        r"Asst\.",
+        r"approx\.",
+        r"dr\.",
+        r"fig\.",
+        r"mr\.",
+        r"mrs\.",
+        r"ms\.",
+        r"prof\.",
+        r"rep\.",
+        r"jr\.",
+        r"sen\.",
+        r"st\.",
+        r"vs\.",
+        r"i\.?e\.",
+    ]
+    for abbr in abbreviations:
+        sentences = re.sub(rf"(\b{abbr}){sep}", r"\1", sentences, flags=re.IGNORECASE)
+
+    return sentences
+
+
+def split_sentences(text, separator="abcdsentenceseperatordcba"):
+    """Return the sentences in `text`; `separator` is a literal temporary marker."""
+    text = re.sub(r"([?!.])(?=\s|$)", lambda m: m.group(1) + separator, text)
+    text = postproc_splits(text, separator)
+    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)
diff --git a/guardrails/validator_base.py b/guardrails/validator_base.py
@@ -33,6 +33,9 @@
 from guardrails.utils.safe_get import safe_get
 from guardrails.utils.hub_telemetry_utils import HubTelemetry
 from guardrails.utils.tokenization_utils import postproc_splits
+from guardrails.utils.tokenization_utils_seperator import (
+    postproc_splits as postproc_splits_separator,
+)
 
 
 ### functions to get chunks ###
@@ -44,6 +47,51 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]
 
 
+def split_sentence_word_tokenizers_jl_separator(
+    chunk: str, separator: str = "abcdsentenceseperatordcba"
+):
+    """
+    Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
+    We return the first sentence and the remaining chunks without the first sentence.
+
+    We perform the first step of WordTokenizers.jl's split_sentences function to
+    detect possible sentence boundaries before calling the sentence tokenizer.
+
+    Args:
+        chunk (str): The text to split into sentences.
+        separator (str): Marker inserted at candidate sentence boundaries; it is
+            handled as literal text.
+
+    Returns:
+        List[str]: A list of two strings. The first string is the first sentence
+            in the chunk. The second string is the remaining text in the chunk.
+            The list is empty when no sentence can be split off yet.
+    """
+    # using the sentence tokenizer is expensive
+    # we check for a . to avoid wastefully calling the tokenizer
+
+    # check at least 3 characters have been accumulated before splitting
+    if len(chunk) < 3:
+        return []
+
+    # check for potential line endings, which is what split_sentences does
+    # NOTE(review): `\s?` also consumes the following whitespace - confirm intended
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])\s?", lambda match: match.group(1) + separator, chunk
+    )
+    if count == 0:
+        return []
+
+    sentences = postproc_splits_separator(chunk_with_potential_line_endings, separator)
+    sentences = re.split(rf"\n?{re.escape(separator)} ?\n?", sentences)
+    # if not more than one sentence, we haven't accumulated enough for a validation
+    if len(sentences) <= 1:
+        return []
+
+    # return the first sentence, then the remaining text that is still accumulating
+    return [sentences[0], "".join(sentences[1:])]
+
+
 def split_sentence_word_tokenizers_jl(chunk: str):
     """
     Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
@@ -69,7 +117,9 @@ def split_sentence_word_tokenizers_jl(chunk: str):
         is_minimum_length = True
 
     # check for potential line endings, which is what split_sentences does
-    chunk_with_potential_line_endings, count = re.subn(r"([?!.])(\s)?", r"\1\n", chunk)
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])(?=\s|$)", r"\1\n", chunk
+    )
     any_potential_line_endings = count > 0
     if not is_minimum_length or not any_potential_line_endings:
         return []
@@ -303,7 +353,7 @@ def _chunking_function(self, chunk: str) -> List[str]:
         Returns:
             list[str]: The text chunked into some subset.
         """
-        return split_sentence_word_tokenizers_jl(chunk)
+        return split_sentence_word_tokenizers_jl_separator(chunk)
 
     def validate_stream(
         self, chunk: Any, metadata: Dict[str, Any], **kwargs