88
99
def replace_til_no_change(input_text, pattern, replacement):
    """
    Apply a regex substitution repeatedly until the text stops changing.

    Each pass runs ``re.sub`` over the whole text; the loop ends on the
    first pass whose output equals its input.  Stopping on "no change"
    (instead of "pattern no longer matches") means a substitution that
    leaves the pattern matching but rewrites nothing cannot spin forever.

    Args:
        input_text: The text to rewrite.
        pattern: Regular expression (string or compiled) to search for.
        replacement: Replacement template or callable, as accepted by
            ``re.sub``.

    Returns:
        The text after the substitution has reached a fixed point.

    Note:
        Termination still depends on the caller: a replacement that
        changes the text on every pass (e.g. one that keeps growing it)
        never reaches a fixed point.
    """
    # Compile once; the same pattern is applied on every pass.
    compiled = re.compile(pattern)
    while True:
        new_text = compiled.sub(replacement, input_text)
        if new_text == input_text:
            return input_text
        input_text = new_text
1417
1518
16- def postproc_splits (sentences ):
19+ def postproc_splits (sentences , separator ):
1720 """
1821 Applies heuristic rules to repair sentence splitting errors.
1922 Developed for use as postprocessing for the GENIA sentence
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
3033 Which draws in part on heuristics included in Yoshimasa Tsuruoka's
3134 medss.pl script.
3235 """
36+
3337 # Remove Windows line endings
3438 sentences = sentences .replace ("\r " , "" )
3539
3640 # Breaks sometimes missing after "?", "safe" cases
37- sentences = re .sub (r"\b([a-z]+\?) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
38- # Breaks sometimes missing after "." separated with extra space, "safe" cases
39- sentences = re .sub (r"\b([a-z]+ \.) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
41+ sentences = re .sub (
42+ r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b" , rf"\1{ separator } \2" , sentences
43+ )
44+ # Breaks sometimes missing after ".", "safe" cases
45+ sentences = re .sub (
46+ r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b" , rf"\1{ separator } \2" , sentences
47+ )
4048
4149 # No breaks producing lines only containing sentence-ending punctuation
42- sentences = re .sub (r"\n ([.!?]+)\n " , r"\1\n" , sentences )
50+ sentences = re .sub (rf" { separator } ([.!?]+){ separator } " , r"\1" + separator , sentences )
4351
4452 # No breaks inside parentheses/brackets
45- # Unlimited length for no intervening parentheses/brackets
4653 sentences = replace_til_no_change (
47- sentences , r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]" , r"[\1 \2]"
54+ sentences ,
55+ r"\[([^\[\]\(\)]*)" + re .escape (separator ) + r"([^\[\]\(\)]*)\]" ,
56+ r"[\1 \2]" ,
4857 )
4958 sentences = replace_til_no_change (
50- sentences , r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)" , r"(\1 \2)"
59+ sentences ,
60+ r"\(([^\[\]\(\)]*)" + re .escape (separator ) + r"([^\[\]\(\)]*)\)" ,
61+ r"(\1 \2)" ,
5162 )
5263 # Standard mismatched with possible intervening
5364 sentences = replace_til_no_change (
54- sentences , r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]" , r"[\1 \2]"
65+ sentences ,
66+ r"\[([^\[\]]{0,250})" + re .escape (separator ) + r"([^\[\]]{0,250})\]" ,
67+ r"[\1 \2]" ,
5568 )
5669 sentences = replace_til_no_change (
57- sentences , r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)" , r"(\1 \2)"
70+ sentences ,
71+ r"\(([^\(\)]{0,250})" + re .escape (separator ) + r"([^\(\)]{0,250})\)" ,
72+ r"(\1 \2)" ,
5873 )
5974
60- # Guardrails mods for line breaks within quotes
75+ # Line breaks within quotes
6176 sentences = replace_til_no_change (
62- sentences , r'"([^"\n]{0,250})\n([^"\n]{0,250})"' , r'"\1 \2"'
77+ sentences ,
78+ r'"([^"\n]{0,250})' + re .escape (separator ) + r'([^"\n]{0,250})"' ,
79+ r'"\1 \2"' ,
6380 )
6481 sentences = replace_til_no_change (
65- sentences , r"'([^'\n]{0,250})\n([^'\n]{0,250})'" , r"'\1 \2'"
82+ sentences ,
83+ r"'([^'\n]{0,250})" + re .escape (separator ) + r"([^'\n]{0,250})'" ,
84+ r"'\1 \2'" ,
6685 )
6786
6887 # Nesting to depth one
6988 sentences = replace_til_no_change (
7089 sentences ,
71- r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
90+ r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
91+ + re .escape (separator )
92+ + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
7293 r"[\1 \2]" ,
7394 )
7495 sentences = replace_til_no_change (
7596 sentences ,
76- r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
97+ r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
98+ + re .escape (separator )
99+ + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
77100 r"(\1 \2)" ,
78101 )
79102
80103 # No break after periods followed by a non-uppercase "normal word"
81- sentences = re .sub (r "\.\n ([a-z]{3 }[a-z-]*[ .:,])" , r". \1" , sentences )
104+ sentences = re .sub (rf "\.{ separator } ([a-z]{{3,} }[a-z-]*[ .:,])" , r". \1" , sentences )
82105
83106 # No break after a single letter other than I
84- sentences = re .sub (r "(\b[A-HJ-Z]\.)\n " , r"\1 " , sentences )
107+ sentences = re .sub (rf "(\b[A-HJ-Z]\.){ separator } " , r"\1 " , sentences )
85108
86109 # No break before coordinating conjunctions (CC)
87110 coordinating_conjunctions = ["and" , "or" , "but" , "nor" , "yet" ]
88111 for cc in coordinating_conjunctions :
89- sentences = re .sub (r"\n(" + cc + r" )" , r" \1" , sentences )
112+ sentences = re .sub (rf" { separator } ( { cc } \s )" , r" \1" , sentences )
90113
91114 # No break before prepositions (IN)
92115 prepositions = [
@@ -115,18 +138,18 @@ def postproc_splits(sentences):
115138 "whether" ,
116139 ]
117140 for prep in prepositions :
118- sentences = re .sub (r"\n(" + prep + r" )" , r" \1" , sentences )
141+ sentences = re .sub (rf" { separator } ( { prep } \s )" , r" \1" , sentences )
119142
120143 # No sentence breaks in the middle of specific abbreviations
121- sentences = re .sub (r "(\be\.)\n (g\.)" , r"\1 \2" , sentences )
122- sentences = re .sub (r "(\bi\.)\n (e\.)" , r"\1 \2" , sentences )
123- sentences = re .sub (r "(\bi\.)\n (v\.)" , r"\1 \2" , sentences )
144+ sentences = re .sub (rf "(\be\.){ separator } (g\.)" , r"\1 \2" , sentences )
145+ sentences = re .sub (rf "(\bi\.){ separator } (e\.)" , r"\1 \2" , sentences )
146+ sentences = re .sub (rf "(\bi\.){ separator } (v\.)" , r"\1 \2" , sentences )
124147
125148 # No sentence break after specific abbreviations
126149 abbreviations = [
127- r"e\. ?g\." ,
128- r"i\. ?e\." ,
129- r"i\. ?v\." ,
150+ r"e\.?g\." ,
151+ r"i\.?e\." ,
152+ r"i\.?v\." ,
130153 r"vs\." ,
131154 r"cf\." ,
132155 r"Dr\." ,
@@ -142,12 +165,11 @@ def postproc_splits(sentences):
142165 r"Fig\." ,
143166 r"vol\." ,
144167 r"Vols\." ,
145- r"no \." ,
168+ r"No \." ,
146169 r"Nos\." ,
147170 r"et\." ,
148171 r"al\." ,
149- r"i\. ?v\." ,
150- r"inc\." ,
172+ r"Inc\." ,
151173 r"Ltd\." ,
152174 r"Co\." ,
153175 r"Corp\." ,
@@ -166,16 +188,18 @@ def postproc_splits(sentences):
166188 r"sen\." ,
167189 r"st\." ,
168190 r"vs\." ,
169- r"i\. ?e\." ,
191+ r"i\.?e\." ,
170192 ]
171193 for abbr in abbreviations :
172- sentences = re .sub (r"(\b" + abbr + r")\n" , r"\1 " , sentences )
194+ sentences = re .sub (
195+ rf"(\b{ abbr } ){ separator } " , r"\1 " , sentences , flags = re .IGNORECASE
196+ )
173197
174198 return sentences
175199
176200
def split_sentences(text, separator="SENTENCEBREAK"):
    """
    Split text into sentences using a marker string as the break token.

    A marker is inserted after every ``?``, ``!`` or ``.`` that is followed
    by whitespace or the end of the text.  The marked text is then passed
    through ``postproc_splits`` and finally split on the marker.

    Args:
        text: The text to split.
        separator: Marker string inserted at candidate sentence breaks.
            It is treated as a literal string in this function.

    Returns:
        A list of strings obtained by splitting on the marker.  One
        optional newline before the marker, and one optional space and
        one optional newline after it, are consumed with the marker.
        The list may end with an empty string when the text still ends
        with a marker after post-processing.
    """
    # Function replacement inserts the separator verbatim, so backslashes or
    # group-reference-like sequences in it are not interpreted by re.sub.
    text = re.sub(
        r"([?!.])(?=\s|$)", lambda match: match.group(1) + separator, text
    )
    text = postproc_splits(text, separator)
    # Escape the separator so regex metacharacters in it match literally.
    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)
0 commit comments