|  | 
|  | 1 | +# This file contains code adapted from the WordTokenizers.jl | 
|  | 2 | +# https://github.com/JuliaText/WordTokenizers.jl project. | 
|  | 3 | +# It is subject to the license terms in the Apache License file | 
|  | 4 | +# found in the top-level directory of this distribution. | 
|  | 5 | +# This file has been modified by Guardrails AI on September 27 2024. | 
|  | 6 | + | 
|  | 7 | +import re | 
|  | 8 | + | 
|  | 9 | + | 
def replace_til_no_change(input_text, pattern, replacement):
    """
    Apply a regex substitution repeatedly until the text stops changing.

    Args:
        input_text: The string to rewrite.
        pattern: Regular expression (string or compiled pattern) to replace.
        replacement: Replacement template or callable, as accepted by
            ``re.sub``.

    Returns:
        The first text for which another substitution pass changes nothing.
    """
    # Compile once; the same pattern is reused on every pass.
    compiled = re.compile(pattern)
    while True:
        updated = compiled.sub(replacement, input_text)
        # Stop at the fixed point. Looping on "pattern still matches" instead
        # would never terminate when the replacement reproduces the text it
        # matched (e.g. pattern "a" with replacement "a").
        if updated == input_text:
            return updated
        input_text = updated
|  | 14 | + | 
|  | 15 | + | 
def postproc_splits(sentences):
    """
    Applies heuristic rules to repair sentence splitting errors.
    Developed for use as postprocessing for the GENIA sentence
    splitter on PubMed abstracts, with minor tweaks for
    full-text documents.

    `sentences` should be a string, with line breaks on sentence boundaries.
    Returns a similar string, but more correct.

    The rules run in a fixed order and each one works on the output of the
    previous one:

    1. strip carriage returns;
    2. add breaks that are missing after "?" and after " ." in "safe" cases;
    3. remove breaks that leave a line of only sentence-ending punctuation;
    4. remove breaks inside parentheses and square brackets;
    5. remove breaks before a lowercase word, after a single capital letter
       other than "I", before coordinating conjunctions and prepositions,
       and inside or after a fixed set of abbreviations.

    Based on
    https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
    Which is
    (c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
    Which draws in part on heuristics included in Yoshimasa Tsuruoka's
    medss.pl script.
    """
    # Remove Windows line endings
    sentences = sentences.replace("\r", "")

    # Breaks sometimes missing after "?", "safe" cases
    sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
    # Breaks sometimes missing after "." separated with extra space, "safe" cases
    sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)

    # No breaks producing lines only containing sentence-ending punctuation
    sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)

    # No breaks inside parentheses/brackets. Each pattern removes one line
    # break per match, so it is re-applied until nothing changes.
    bracket_rules = (
        # Unlimited length for no intervening parentheses/brackets
        (r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"),
        (r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"),
        # Standard mismatched with possible intervening
        (r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"),
        (r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"),
        # Nesting to depth one
        (
            r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
            r"[\1 \2]",
        ),
        (
            r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
            r"(\1 \2)",
        ),
    )
    for pattern, replacement in bracket_rules:
        sentences = replace_til_no_change(sentences, pattern, replacement)

    # No break after periods followed by a non-uppercase "normal word"
    sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)

    # No break after a single letter other than I
    sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)

    # No break before coordinating conjunctions (CC)
    coordinating_conjunctions = ("and", "or", "but", "nor", "yet")
    # No break before prepositions (IN)
    prepositions = (
        "of",
        "in",
        "by",
        "as",
        "on",
        "at",
        "to",
        "via",
        "for",
        "with",
        "that",
        "than",
        "from",
        "into",
        "upon",
        "after",
        "while",
        "during",
        "within",
        "through",
        "between",
        "whereas",
        "whether",
    )
    # The words are applied one at a time, in order, on purpose: a join made
    # for one word can supply the trailing space that lets a later word match,
    # so a single combined alternation would not give the same result.
    for word in coordinating_conjunctions + prepositions:
        sentences = re.sub(r"\n(" + word + r" )", r" \1", sentences)

    # No sentence breaks in the middle of specific abbreviations
    sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
    sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
    sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)

    # No sentence break after specific abbreviations.
    # Each entry is a regex fragment and appears exactly once: a repeated
    # entry can never match again, because later passes only turn "\n" into
    # a space and no entry ends in "i.".
    abbreviations = (
        r"e\. ?g\.",
        r"i\. ?e\.",
        r"i\. ?v\.",
        r"vs\.",
        r"cf\.",
        r"Dr\.",
        r"Mr\.",
        r"Ms\.",
        r"Mrs\.",
        r"Prof\.",
        r"Ph\.?D\.",
        r"Jr\.",
        r"St\.",
        r"Mt\.",
        r"etc\.",
        r"Fig\.",
        r"vol\.",
        r"Vols\.",
        r"no\.",
        r"Nos\.",
        r"et\.",
        r"al\.",
        r"inc\.",
        r"Ltd\.",
        r"Co\.",
        r"Corp\.",
        r"Dept\.",
        r"est\.",
        r"Asst\.",
        r"approx\.",
        r"dr\.",
        r"fig\.",
        r"mr\.",
        r"mrs\.",
        r"ms\.",
        r"prof\.",
        r"rep\.",
        r"jr\.",
        r"sen\.",
        r"st\.",
    )
    for abbr in abbreviations:
        sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)

    return sentences
|  | 167 | + | 
|  | 168 | + | 
|  | 169 | +# Original split sentences function from rulebased_split_sentences | 
def split_sentences(text):
    """
    Split `text` into a list of sentence strings.

    A line break replaces the whitespace character that follows each
    "?", "!" or ".". The result is passed through `postproc_splits`, and
    what it returns is split on line breaks.
    """
    boundary_marked = re.sub(r"([?!.])\s", r"\1\n", text)
    return postproc_splits(boundary_marked).split("\n")
0 commit comments