Skip to content

Commit 416f290

Browse files
Added rule based sentence tokenization from WordTokenizers.jl with minor modifications
1 parent c1f7350 commit 416f290

File tree

1 file changed

+173
-0
lines changed

1 file changed

+173
-0
lines changed
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# This file contains code adapted from the WordTokenizers.jl
2+
# https://github.com/JuliaText/WordTokenizers.jl project.
3+
# It is subject to the license terms in the Apache License file
4+
# found in the top-level directory of this distribution.
5+
# This file has been modified by Guardrails AI on September 27 2024.
6+
7+
import re
8+
9+
10+
def replace_til_no_change(input_text, pattern, replacement):
    """Repeatedly apply ``re.sub(pattern, replacement)`` until a fixed point.

    Needed when a single substitution pass can expose a new match —
    e.g. collapsing nested brackets one level per pass.

    Args:
        input_text: The text to transform.
        pattern: Regular-expression pattern (string or compiled).
        replacement: Replacement string; may use backreferences.

    Returns:
        The text once a full pass makes no further substitutions.

    Note: like the original implementation, this loops forever if the
    replacement always re-creates a match of the pattern.
    """
    # re.subn both rewrites and reports the substitution count, so one
    # scan per iteration detects the fixed point (the previous version
    # did a separate re.search scan before every re.sub).
    while True:
        input_text, n_subs = re.subn(pattern, replacement, input_text)
        if n_subs == 0:
            return input_text
14+
15+
16+
def postproc_splits(sentences):
    """
    Applies heuristic rules to repair sentence splitting errors.
    Developed for use as postprocessing for the GENIA sentence
    splitter on PubMed abstracts, with minor tweaks for
    full-text documents.

    `sentences` should be a string, with line breaks on sentence boundaries.
    Returns a similar string, but more correct.

    Based on
    https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
    Which is
    (c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
    Which draws in part on heuristics included in Yoshimasa Tsuruoka's
    medss.pl script.
    """
    # Remove Windows line endings so "\n" is the only boundary marker.
    sentences = sentences.replace("\r", "")

    # Breaks sometimes missing after "?", "safe" cases
    sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
    # Breaks sometimes missing after "." separated with extra space, "safe" cases
    sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)

    # No breaks producing lines only containing sentence-ending punctuation
    sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)

    # No breaks inside parentheses/brackets.  Each (pattern, replacement)
    # pair is re-applied until a fixed point, because one pass can expose
    # a new match (e.g. nested brackets collapse one level per pass).
    bracket_repairs = [
        # Unlimited length for no intervening parentheses/brackets
        (r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"),
        (r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"),
        # Standard mismatched with possible intervening
        (r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"),
        (r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"),
        # Nesting to depth one
        (
            r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n"
            r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
            r"[\1 \2]",
        ),
        (
            r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n"
            r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
            r"(\1 \2)",
        ),
    ]
    for pattern, replacement in bracket_repairs:
        sentences = replace_til_no_change(sentences, pattern, replacement)

    # No break after periods followed by a non-uppercase "normal word"
    sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)

    # No break after a single letter other than I
    sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)

    # No break before coordinating conjunctions (CC) or prepositions (IN);
    # both categories use the same "pull the word back onto the previous
    # line" substitution, so one loop covers them (CCs first, preserving
    # the original application order).
    function_words = [
        # coordinating conjunctions
        "and", "or", "but", "nor", "yet",
        # prepositions
        "of", "in", "by", "as", "on", "at", "to", "via", "for", "with",
        "that", "than", "from", "into", "upon", "after", "while", "during",
        "within", "through", "between", "whereas", "whether",
    ]
    for word in function_words:
        sentences = re.sub(r"\n(" + word + r" )", r" \1", sentences)

    # No sentence breaks in the middle of specific abbreviations
    sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
    sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
    sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)

    # No sentence break after specific abbreviations.  The original list
    # contained duplicate entries (vs\., i\. ?e\., i\. ?v\.); the repeats
    # are dropped here — these substitutions are idempotent, so removing
    # a second application cannot change the output.
    abbreviations = [
        r"e\. ?g\.", r"i\. ?e\.", r"i\. ?v\.",
        r"vs\.", r"cf\.",
        r"Dr\.", r"Mr\.", r"Ms\.", r"Mrs\.", r"Prof\.", r"Ph\.?D\.",
        r"Jr\.", r"St\.", r"Mt\.", r"etc\.", r"Fig\.", r"vol\.",
        r"Vols\.", r"no\.", r"Nos\.", r"et\.", r"al\.",
        r"inc\.", r"Ltd\.", r"Co\.", r"Corp\.", r"Dept\.", r"est\.",
        r"Asst\.", r"approx\.", r"dr\.", r"fig\.", r"mr\.", r"mrs\.",
        r"ms\.", r"prof\.", r"rep\.", r"jr\.", r"sen\.", r"st\.",
    ]
    for abbr in abbreviations:
        sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)

    return sentences
167+
168+
169+
# Original split sentences function from rulebased_split_sentences
170+
def split_sentences(text):
    """Split `text` into a list of sentence strings.

    A candidate break is inserted after every ".", "!" or "?" that is
    followed by a whitespace character; `postproc_splits` then repairs
    over-eager breaks (abbreviations, bracketed spans, etc.) before the
    result is cut on the remaining newlines.
    """
    with_breaks = re.sub(r"([?!.])\s", r"\1\n", text)
    repaired = postproc_splits(with_breaks)
    return repaired.split("\n")

0 commit comments

Comments
 (0)