Commit bd0ab50 (1 parent: 1a2546a)

Testing changes using a custom separator in the word-tokenizer sentence-splitting algorithm

File tree: 2 files changed, +71 −41 lines


guardrails/utils/tokenization_utils.py

Lines changed: 61 additions & 37 deletions
@@ -8,12 +8,15 @@


 def replace_til_no_change(input_text, pattern, replacement):
-    while re.search(pattern, input_text):
-        input_text = re.sub(pattern, replacement, input_text)
+    while True:
+        new_text = re.sub(pattern, replacement, input_text)
+        if new_text == input_text:
+            break
+        input_text = new_text
     return input_text


-def postproc_splits(sentences):
+def postproc_splits(sentences, separator):
     """
     Applies heuristic rules to repair sentence splitting errors.
     Developed for use as postprocessing for the GENIA sentence
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
     Which draws in part on heuristics included in Yoshimasa Tsuruoka's
     medss.pl script.
     """
+
     # Remove Windows line endings
     sentences = sentences.replace("\r", "")

     # Breaks sometimes missing after "?", "safe" cases
-    sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
-    # Breaks sometimes missing after "." separated with extra space, "safe" cases
-    sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
+    sentences = re.sub(
+        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )
+    # Breaks sometimes missing after ".", "safe" cases
+    sentences = re.sub(
+        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )

     # No breaks producing lines only containing sentence-ending punctuation
-    sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
+    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)

     # No breaks inside parentheses/brackets
-    # Unlimited length for no intervening parentheses/brackets
     sentences = replace_til_no_change(
-        sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
+        sentences,
+        r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
+        r"[\1 \2]",
     )
     sentences = replace_til_no_change(
-        sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
+        sentences,
+        r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
+        r"(\1 \2)",
     )
     # Standard mismatched with possible intervening
     sentences = replace_til_no_change(
-        sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
+        sentences,
+        r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
+        r"[\1 \2]",
     )
     sentences = replace_til_no_change(
-        sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
+        sentences,
+        r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
+        r"(\1 \2)",
     )

-    # Guardrails mods for line breaks within quotes
+    # Line breaks within quotes
     sentences = replace_til_no_change(
-        sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
+        sentences,
+        r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
+        r'"\1 \2"',
     )
     sentences = replace_til_no_change(
-        sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
+        sentences,
+        r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
+        r"'\1 \2'",
     )

     # Nesting to depth one
     sentences = replace_til_no_change(
         sentences,
-        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
+        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
         r"[\1 \2]",
     )
     sentences = replace_til_no_change(
         sentences,
-        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
+        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
         r"(\1 \2)",
     )

     # No break after periods followed by a non-uppercase "normal word"
-    sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
+    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

     # No break after a single letter other than I
-    sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
+    sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)

     # No break before coordinating conjunctions (CC)
     coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
     for cc in coordinating_conjunctions:
-        sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
+        sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)

     # No break before prepositions (IN)
     prepositions = [
@@ -115,18 +138,18 @@ def postproc_splits(sentences):
         "whether",
     ]
     for prep in prepositions:
-        sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
+        sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)

     # No sentence breaks in the middle of specific abbreviations
-    sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
-    sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
-    sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
+    sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)

     # No sentence break after specific abbreviations
     abbreviations = [
-        r"e\. ?g\.",
-        r"i\. ?e\.",
-        r"i\. ?v\.",
+        r"e\.?g\.",
+        r"i\.?e\.",
+        r"i\.?v\.",
         r"vs\.",
         r"cf\.",
         r"Dr\.",
@@ -142,12 +165,11 @@ def postproc_splits(sentences):
         r"Fig\.",
         r"vol\.",
         r"Vols\.",
-        r"no\.",
+        r"No\.",
         r"Nos\.",
         r"et\.",
         r"al\.",
-        r"i\. ?v\.",
-        r"inc\.",
+        r"Inc\.",
         r"Ltd\.",
         r"Co\.",
         r"Corp\.",
@@ -166,16 +188,18 @@ def postproc_splits(sentences):
         r"sen\.",
         r"st\.",
         r"vs\.",
-        r"i\. ?e\.",
+        r"i\.?e\.",
     ]
     for abbr in abbreviations:
-        sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
+        sentences = re.sub(
+            rf"(\b{abbr}){separator}", r"\1 ", sentences, flags=re.IGNORECASE
+        )

     return sentences


-# Original split sentences function from rulebased_split_sentences
-def split_sentences(text):
-    text = re.sub(r"([?!.])\s", r"\1\n", text)
-    text = postproc_splits(text)
-    return text.split("\n")
+def split_sentences(text, separator="SENTENCEBREAK"):
+    # Tag sentence-ending punctuation with the separator, then split on it
+    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
+    text = postproc_splits(text, separator)
+    return re.split(rf"\n?{separator} ?\n?", text)
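
The rewritten split_sentences tags each sentence-ending punctuation mark with the separator (via a lookahead, so the following whitespace is preserved), lets postproc_splits erase markers that fall inside quotes, brackets, or abbreviations, and finally splits on the marker. A hand-traced sketch of the intended behavior, using the module path from this commit; the expected outputs are inferred from the regexes above, not from a recorded test run:

    from guardrails.utils.tokenization_utils import split_sentences

    print(split_sentences("It rained. We stayed inside. The storm passed."))
    # -> ['It rained.', 'We stayed inside.', 'The storm passed.', '']
    # The trailing empty string comes from the marker tagged after the
    # final period; callers may want to drop empty fragments.

    # The abbreviation heuristics suppress a false break after "Fig.":
    print(split_sentences("See Fig. 2 for details. It is striking."))
    # -> ['See Fig.  2 for details.', 'It is striking.', '']
    # (Doubled space: the lookahead no longer consumes the whitespace after
    # the punctuation, so the join re-inserts a second space.)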

guardrails/validator_base.py

Lines changed: 10 additions & 4 deletions
@@ -44,7 +44,9 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]


-def split_sentence_word_tokenizers_jl(chunk: str):
+def split_sentence_word_tokenizers_jl(
+    chunk: str, separator: str = "SENTENCEBREAK"
+) -> List[str]:
     """
     Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
     We return the first sentence and the remaining chunks without the first sentence.
@@ -69,12 +71,16 @@ def split_sentence_word_tokenizers_jl(chunk: str):
     is_minimum_length = True

     # check for potential line endings, which is what split_sentences does
-    chunk_with_potential_line_endings, count = re.subn(r"([?!.])\s", r"\1\n", chunk)
-    any_potential_line_endings = count > 0
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
+    )
+    any_potential_line_endings = count > 0
     if not is_minimum_length or not any_potential_line_endings:
         return []

-    sentences = postproc_splits(chunk_with_potential_line_endings).split("\n")
+    sentences = postproc_splits(chunk_with_potential_line_endings, separator).split(
+        separator
+    )
     # if not more than one sentence, we haven't accumulated enough for a validation
     if len(sentences) <= 1:
         return []
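
Because the detection step unpacks two return values, it relies on re.subn, which returns a (new_string, substitution_count) tuple; plain re.sub returns only the string and would fail to unpack. A standalone sketch of that step with a hypothetical chunk:

    import re

    separator = "SENTENCEBREAK"  # default marker used above

    chunk = "This part is done. This part is not"
    tagged, count = re.subn(r"([?!.])(?=\s|$)", rf"\1{separator}", chunk)
    print(tagged)  # This part is done.SENTENCEBREAK This part is not
    print(count)   # 1 -> at least one potential sentence boundary exists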
