Skip to content

Commit 3fb5765

Browse files
Reverted to the pre-separator algorithm; added a fix for conditional whitespace after "?", "!" and "." characters
1 parent fb1d2d0 commit 3fb5765

File tree

1 file changed

+37
-61
lines changed

1 file changed

+37
-61
lines changed

guardrails/utils/tokenization_utils.py

Lines changed: 37 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,12 @@
88

99

1010
def replace_til_no_change(input_text, pattern, replacement):
    """Apply ``re.sub`` repeatedly until the text reaches a fixed point.

    Args:
        input_text: The string to rewrite.
        pattern: Regular expression (string or compiled) passed to ``re.sub``.
        replacement: Replacement string or callable passed to ``re.sub``.

    Returns:
        The text after the last substitution pass that changed anything.
        If the pattern never matches, ``input_text`` is returned unchanged.
    """
    while True:
        new_text = re.sub(pattern, replacement, input_text)
        # Stop when a pass changes nothing. Testing for a fixed point (rather
        # than "does the pattern still match?") guarantees termination when
        # the replacement reproduces the matched text verbatim.
        if new_text == input_text:
            return input_text
        input_text = new_text
1714

1815

19-
def postproc_splits(sentences, separator):
16+
def postproc_splits(sentences):
2017
"""
2118
Applies heuristic rules to repair sentence splitting errors.
2219
Developed for use as postprocessing for the GENIA sentence
@@ -33,83 +30,63 @@ def postproc_splits(sentences, separator):
3330
Which draws in part on heuristics included in Yoshimasa Tsuruoka's
3431
medss.pl script.
3532
"""
36-
3733
# Remove Windows line endings
3834
sentences = sentences.replace("\r", "")
3935

4036
# Breaks sometimes missing after "?", "safe" cases
41-
sentences = re.sub(
42-
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
43-
)
44-
# Breaks sometimes missing after ".", "safe" cases
45-
sentences = re.sub(
46-
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
47-
)
37+
sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
38+
# Breaks sometimes missing after "." separated with extra space, "safe" cases
39+
sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
4840

4941
# No breaks producing lines only containing sentence-ending punctuation
50-
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
42+
sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
5143

5244
# No breaks inside parentheses/brackets
45+
# Unlimited length for no intervening parentheses/brackets
5346
sentences = replace_til_no_change(
54-
sentences,
55-
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
56-
r"[\1 \2]",
47+
sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
5748
)
5849
sentences = replace_til_no_change(
59-
sentences,
60-
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
61-
r"(\1 \2)",
50+
sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
6251
)
6352
# Standard mismatched with possible intervening
6453
sentences = replace_til_no_change(
65-
sentences,
66-
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
67-
r"[\1 \2]",
54+
sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
6855
)
6956
sentences = replace_til_no_change(
70-
sentences,
71-
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
72-
r"(\1 \2)",
57+
sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
7358
)
7459

75-
# Line breaks within quotes
60+
# Guardrails mods for line breaks within quotes
7661
sentences = replace_til_no_change(
77-
sentences,
78-
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
79-
r'"\1 \2"',
62+
sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
8063
)
8164
sentences = replace_til_no_change(
82-
sentences,
83-
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
84-
r"'\1 \2'",
65+
sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
8566
)
8667

8768
# Nesting to depth one
8869
sentences = replace_til_no_change(
8970
sentences,
90-
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
91-
+ re.escape(separator)
92-
+ r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
71+
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
9372
r"[\1 \2]",
9473
)
9574
sentences = replace_til_no_change(
9675
sentences,
97-
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
98-
+ re.escape(separator)
99-
+ r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
76+
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
10077
r"(\1 \2)",
10178
)
10279

10380
# No break after periods followed by a non-uppercase "normal word"
104-
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
81+
sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
10582

10683
# No break after a single letter other than I
107-
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
84+
sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
10885

10986
# No break before coordinating conjunctions (CC)
11087
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
11188
for cc in coordinating_conjunctions:
112-
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
89+
sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
11390

11491
# No break before prepositions (IN)
11592
prepositions = [
@@ -138,18 +115,18 @@ def postproc_splits(sentences, separator):
138115
"whether",
139116
]
140117
for prep in prepositions:
141-
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
118+
sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
142119

143120
# No sentence breaks in the middle of specific abbreviations
144-
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
145-
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
146-
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
121+
sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
122+
sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
123+
sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
147124

148125
# No sentence break after specific abbreviations
149126
abbreviations = [
150-
r"e\.?g\.",
151-
r"i\.?e\.",
152-
r"i\.?v\.",
127+
r"e\. ?g\.",
128+
r"i\. ?e\.",
129+
r"i\. ?v\.",
153130
r"vs\.",
154131
r"cf\.",
155132
r"Dr\.",
@@ -165,11 +142,12 @@ def postproc_splits(sentences, separator):
165142
r"Fig\.",
166143
r"vol\.",
167144
r"Vols\.",
168-
r"No\.",
145+
r"no\.",
169146
r"Nos\.",
170147
r"et\.",
171148
r"al\.",
172-
r"Inc\.",
149+
r"i\. ?v\.",
150+
r"inc\.",
173151
r"Ltd\.",
174152
r"Co\.",
175153
r"Corp\.",
@@ -188,18 +166,16 @@ def postproc_splits(sentences, separator):
188166
r"sen\.",
189167
r"st\.",
190168
r"vs\.",
191-
r"i\.?e\.",
169+
r"i\. ?e\.",
192170
]
193171
for abbr in abbreviations:
194-
sentences = re.sub(
195-
rf"(\b{abbr}){separator}", r"\1 ", sentences, flags=re.IGNORECASE
196-
)
172+
sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
197173

198174
return sentences
199175

200176

201-
def split_sentences(text, separator="SENTENCEBREAK"):
202-
# Use the separator in the regex
203-
text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
204-
text = postproc_splits(text, separator)
205-
return re.split(rf"\n?{separator} ?\n?", text)
177+
# Original split sentences function from rulebased_split_sentences
178+
def split_sentences(text):
    """Split ``text`` into a list of sentence strings.

    A newline is first inserted after every ``?``, ``!`` or ``.``; the
    tentative breaks are then handed to ``postproc_splits`` for heuristic
    repair, and the result is split on newlines.

    Args:
        text: The string to split.

    Returns:
        The list produced by splitting the repaired text on ``"\\n"``.
        Pieces are returned as-is; no empty-string filtering is done here.
    """
    # Mark a candidate break after each terminator, dropping at most one
    # whitespace character that directly follows it.
    marked = re.sub(r"([?!.])(\s)?", r"\1\n", text)
    # Let the heuristic pass repair the candidate breaks.
    repaired = postproc_splits(marked)
    return repaired.split("\n")

0 commit comments

Comments
 (0)