88
99
def replace_til_no_change(input_text, pattern, replacement):
    """Apply a regex substitution repeatedly until the text stops changing.

    A single ``re.sub`` pass replaces only non-overlapping matches, so a
    rewrite can expose new matches (for example several line breaks inside
    one bracketed span). This helper re-runs the substitution until a pass
    leaves the text untouched.

    Args:
        input_text: The text to rewrite.
        pattern: Regular expression (string or compiled pattern) to look for.
        replacement: Replacement string or callable, as accepted by ``re.sub``.

    Returns:
        The text after the first substitution pass that changes nothing.
    """
    # Compile once; re.compile() returns an already-compiled pattern as-is.
    compiled = re.compile(pattern)
    while True:
        new_text = compiled.sub(replacement, input_text)
        # Stop on a fixed point rather than on "pattern no longer matches":
        # a replacement that still matches the pattern (e.g. one that
        # reproduces the matched text) would otherwise loop forever.
        if new_text == input_text:
            return input_text
        input_text = new_text
1714
1815
19- def postproc_splits (sentences , separator ):
16+ def postproc_splits (sentences ):
2017 """
2118 Applies heuristic rules to repair sentence splitting errors.
2219 Developed for use as postprocessing for the GENIA sentence
@@ -33,83 +30,63 @@ def postproc_splits(sentences, separator):
3330 Which draws in part on heuristics included in Yoshimasa Tsuruoka's
3431 medss.pl script.
3532 """
36-
3733 # Remove Windows line endings
3834 sentences = sentences .replace ("\r " , "" )
3935
4036 # Breaks sometimes missing after "?", "safe" cases
41- sentences = re .sub (
42- r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b" , rf"\1{ separator } \2" , sentences
43- )
44- # Breaks sometimes missing after ".", "safe" cases
45- sentences = re .sub (
46- r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b" , rf"\1{ separator } \2" , sentences
47- )
37+ sentences = re .sub (r"\b([a-z]+\?) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
38+ # Breaks sometimes missing after "." separated with extra space, "safe" cases
39+ sentences = re .sub (r"\b([a-z]+ \.) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
4840
4941 # No breaks producing lines only containing sentence-ending punctuation
50- sentences = re .sub (rf" { separator } ([.!?]+){ separator } " , r"\1" + separator , sentences )
42+ sentences = re .sub (r"\n ([.!?]+)\n " , r"\1\n" , sentences )
5143
5244 # No breaks inside parentheses/brackets
45+ # Unlimited length for no intervening parentheses/brackets
5346 sentences = replace_til_no_change (
54- sentences ,
55- r"\[([^\[\]\(\)]*)" + re .escape (separator ) + r"([^\[\]\(\)]*)\]" ,
56- r"[\1 \2]" ,
47+ sentences , r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]" , r"[\1 \2]"
5748 )
5849 sentences = replace_til_no_change (
59- sentences ,
60- r"\(([^\[\]\(\)]*)" + re .escape (separator ) + r"([^\[\]\(\)]*)\)" ,
61- r"(\1 \2)" ,
50+ sentences , r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)" , r"(\1 \2)"
6251 )
6352 # Standard mismatched with possible intervening
6453 sentences = replace_til_no_change (
65- sentences ,
66- r"\[([^\[\]]{0,250})" + re .escape (separator ) + r"([^\[\]]{0,250})\]" ,
67- r"[\1 \2]" ,
54+ sentences , r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]" , r"[\1 \2]"
6855 )
6956 sentences = replace_til_no_change (
70- sentences ,
71- r"\(([^\(\)]{0,250})" + re .escape (separator ) + r"([^\(\)]{0,250})\)" ,
72- r"(\1 \2)" ,
57+ sentences , r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)" , r"(\1 \2)"
7358 )
7459
75- # Line breaks within quotes
60+ # Guardrails mods for line breaks within quotes
7661 sentences = replace_til_no_change (
77- sentences ,
78- r'"([^"\n]{0,250})' + re .escape (separator ) + r'([^"\n]{0,250})"' ,
79- r'"\1 \2"' ,
62+ sentences , r'"([^"\n]{0,250})\n([^"\n]{0,250})"' , r'"\1 \2"'
8063 )
8164 sentences = replace_til_no_change (
82- sentences ,
83- r"'([^'\n]{0,250})" + re .escape (separator ) + r"([^'\n]{0,250})'" ,
84- r"'\1 \2'" ,
65+ sentences , r"'([^'\n]{0,250})\n([^'\n]{0,250})'" , r"'\1 \2'"
8566 )
8667
8768 # Nesting to depth one
8869 sentences = replace_til_no_change (
8970 sentences ,
90- r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
91- + re .escape (separator )
92- + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
71+ r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
9372 r"[\1 \2]" ,
9473 )
9574 sentences = replace_til_no_change (
9675 sentences ,
97- r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
98- + re .escape (separator )
99- + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
76+ r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
10077 r"(\1 \2)" ,
10178 )
10279
10380 # No break after periods followed by a non-uppercase "normal word"
104- sentences = re .sub (rf "\.{ separator } ([a-z]{{3,} }[a-z-]*[ .:,])" , r". \1" , sentences )
81+ sentences = re .sub (r "\.\n ([a-z]{3 }[a-z-]*[ .:,])" , r". \1" , sentences )
10582
10683 # No break after a single letter other than I
107- sentences = re .sub (rf "(\b[A-HJ-Z]\.){ separator } " , r"\1 " , sentences )
84+ sentences = re .sub (r "(\b[A-HJ-Z]\.)\n " , r"\1 " , sentences )
10885
10986 # No break before coordinating conjunctions (CC)
11087 coordinating_conjunctions = ["and" , "or" , "but" , "nor" , "yet" ]
11188 for cc in coordinating_conjunctions :
112- sentences = re .sub (rf" { separator } ( { cc } \s )" , r" \1" , sentences )
89+ sentences = re .sub (r"\n(" + cc + r" )" , r" \1" , sentences )
11390
11491 # No break before prepositions (IN)
11592 prepositions = [
@@ -138,18 +115,18 @@ def postproc_splits(sentences, separator):
138115 "whether" ,
139116 ]
140117 for prep in prepositions :
141- sentences = re .sub (rf" { separator } ( { prep } \s )" , r" \1" , sentences )
118+ sentences = re .sub (r"\n(" + prep + r" )" , r" \1" , sentences )
142119
143120 # No sentence breaks in the middle of specific abbreviations
144- sentences = re .sub (rf "(\be\.){ separator } (g\.)" , r"\1 \2" , sentences )
145- sentences = re .sub (rf "(\bi\.){ separator } (e\.)" , r"\1 \2" , sentences )
146- sentences = re .sub (rf "(\bi\.){ separator } (v\.)" , r"\1 \2" , sentences )
121+ sentences = re .sub (r "(\be\.)\n (g\.)" , r"\1 \2" , sentences )
122+ sentences = re .sub (r "(\bi\.)\n (e\.)" , r"\1 \2" , sentences )
123+ sentences = re .sub (r "(\bi\.)\n (v\.)" , r"\1 \2" , sentences )
147124
148125 # No sentence break after specific abbreviations
149126 abbreviations = [
150- r"e\.?g\." ,
151- r"i\.?e\." ,
152- r"i\.?v\." ,
127+ r"e\. ?g\." ,
128+ r"i\. ?e\." ,
129+ r"i\. ?v\." ,
153130 r"vs\." ,
154131 r"cf\." ,
155132 r"Dr\." ,
@@ -165,11 +142,12 @@ def postproc_splits(sentences, separator):
165142 r"Fig\." ,
166143 r"vol\." ,
167144 r"Vols\." ,
168- r"No \." ,
145+ r"no \." ,
169146 r"Nos\." ,
170147 r"et\." ,
171148 r"al\." ,
172- r"Inc\." ,
149+ r"i\. ?v\." ,
150+ r"inc\." ,
173151 r"Ltd\." ,
174152 r"Co\." ,
175153 r"Corp\." ,
@@ -188,18 +166,16 @@ def postproc_splits(sentences, separator):
188166 r"sen\." ,
189167 r"st\." ,
190168 r"vs\." ,
191- r"i\.?e\." ,
169+ r"i\. ?e\." ,
192170 ]
193171 for abbr in abbreviations :
194- sentences = re .sub (
195- rf"(\b{ abbr } ){ separator } " , r"\1 " , sentences , flags = re .IGNORECASE
196- )
172+ sentences = re .sub (r"(\b" + abbr + r")\n" , r"\1 " , sentences )
197173
198174 return sentences
199175
200176
# Original split sentences function from rulebased_split_sentences
def split_sentences(text):
    """Split ``text`` into a list of sentences using rule-based heuristics.

    A newline is inserted after every ``?``, ``!`` or ``.`` (consuming one
    following whitespace character when present). The candidate breaks are
    then passed through ``postproc_splits`` for heuristic repair, and the
    result is split on newlines.

    Args:
        text: The raw text to segment.

    Returns:
        The list of strings obtained by splitting the repaired text on
        ``"\\n"``.
    """
    # Mark a candidate sentence boundary after each terminal punctuation mark.
    with_breaks = re.sub(r"([?!.])(\s)?", r"\1\n", text)
    # Repair the candidate boundaries with the heuristic post-processing rules.
    repaired = postproc_splits(with_breaks)
    return repaired.split("\n")
0 commit comments