88
99
def replace_til_no_change(input_text, pattern, replacement):
    """Apply a regex substitution repeatedly until the text stops changing.

    A single ``re.sub`` pass only rewrites non-overlapping matches, so a
    pattern whose matches can overlap or nest (e.g. several line breaks
    inside one pair of brackets) needs multiple passes.

    Args:
        input_text: The text to rewrite.
        pattern: Regular expression (string or compiled) to search for.
        replacement: Replacement template or callable, as accepted by
            ``re.sub``.

    Returns:
        The text after the first pass that produces no further change.
    """
    while True:
        new_text = re.sub(pattern, replacement, input_text)
        # Stop on a fixed point rather than on "pattern no longer matches":
        # if the replacement still matches the pattern (e.g. it rewrites a
        # match to identical text), a search-based loop would never end.
        if new_text == input_text:
            return input_text
        input_text = new_text
1417
1518
16- def  postproc_splits (sentences ):
19+ def  postproc_splits (sentences ,  separator ):
1720    """ 
1821    Applies heuristic rules to repair sentence splitting errors. 
1922    Developed for use as postprocessing for the GENIA sentence 
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
3033    Which draws in part on heuristics included in Yoshimasa Tsuruoka's 
3134    medss.pl script. 
3235    """ 
36+ 
3337    # Remove Windows line endings 
3438    sentences  =  sentences .replace ("\r " , "" )
3539
3640    # Breaks sometimes missing after "?", "safe" cases 
37-     sentences  =  re .sub (r"\b([a-z]+\?) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
38-     # Breaks sometimes missing after "." separated with extra space, "safe" cases 
39-     sentences  =  re .sub (r"\b([a-z]+ \.) ([A-Z][a-z]+)\b" , r"\1\n\2" , sentences )
41+     sentences  =  re .sub (
42+         r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b" , rf"\1{ separator }  \2" , sentences 
43+     )
44+     # Breaks sometimes missing after ".", "safe" cases 
45+     sentences  =  re .sub (
46+         r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b" , rf"\1{ separator }  \2" , sentences 
47+     )
4048
4149    # No breaks producing lines only containing sentence-ending punctuation 
42-     sentences  =  re .sub (r"\n ([.!?]+)\n " , r"\1\n"  , sentences )
50+     sentences  =  re .sub (rf" { separator }  ([.!?]+){ separator }  " , r"\1"    +   separator , sentences )
4351
4452    # No breaks inside parentheses/brackets 
45-     # Unlimited length for no intervening parentheses/brackets 
4653    sentences  =  replace_til_no_change (
47-         sentences , r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]" , r"[\1 \2]" 
54+         sentences ,
55+         r"\[([^\[\]\(\)]*)"  +  re .escape (separator ) +  r"([^\[\]\(\)]*)\]" ,
56+         r"[\1 \2]" ,
4857    )
4958    sentences  =  replace_til_no_change (
50-         sentences , r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)" , r"(\1 \2)" 
59+         sentences ,
60+         r"\(([^\[\]\(\)]*)"  +  re .escape (separator ) +  r"([^\[\]\(\)]*)\)" ,
61+         r"(\1 \2)" ,
5162    )
5263    # Standard mismatched with possible intervening 
5364    sentences  =  replace_til_no_change (
54-         sentences , r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]" , r"[\1 \2]" 
65+         sentences ,
66+         r"\[([^\[\]]{0,250})"  +  re .escape (separator ) +  r"([^\[\]]{0,250})\]" ,
67+         r"[\1 \2]" ,
5568    )
5669    sentences  =  replace_til_no_change (
57-         sentences , r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)" , r"(\1 \2)" 
70+         sentences ,
71+         r"\(([^\(\)]{0,250})"  +  re .escape (separator ) +  r"([^\(\)]{0,250})\)" ,
72+         r"(\1 \2)" ,
5873    )
5974
60-     # Guardrails mods for line  breaks within quotes 
75+     # Line  breaks within quotes 
6176    sentences  =  replace_til_no_change (
62-         sentences , r'"([^"\n]{0,250})\n([^"\n]{0,250})"' , r'"\1 \2"' 
77+         sentences ,
78+         r'"([^"\n]{0,250})'  +  re .escape (separator ) +  r'([^"\n]{0,250})"' ,
79+         r'"\1 \2"' ,
6380    )
6481    sentences  =  replace_til_no_change (
65-         sentences , r"'([^'\n]{0,250})\n([^'\n]{0,250})'" , r"'\1 \2'" 
82+         sentences ,
83+         r"'([^'\n]{0,250})"  +  re .escape (separator ) +  r"([^'\n]{0,250})'" ,
84+         r"'\1 \2'" ,
6685    )
6786
6887    # Nesting to depth one 
6988    sentences  =  replace_til_no_change (
7089        sentences ,
71-         r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
90+         r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})" 
91+         +  re .escape (separator )
92+         +  r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]" ,
7293        r"[\1 \2]" ,
7394    )
7495    sentences  =  replace_til_no_change (
7596        sentences ,
76-         r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
97+         r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})" 
98+         +  re .escape (separator )
99+         +  r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)" ,
77100        r"(\1 \2)" ,
78101    )
79102
80103    # No break after periods followed by a non-uppercase "normal word" 
81-     sentences  =  re .sub (r "\.\n ([a-z]{3 }[a-z-]*[ .:,])" , r". \1" , sentences )
104+     sentences  =  re .sub (rf "\.{ separator }  ([a-z]{{3,} }[a-z-]*[ .:,])" , r". \1" , sentences )
82105
83106    # No break after a single letter other than I 
84-     sentences  =  re .sub (r "(\b[A-HJ-Z]\.)\n " , r"\1 " , sentences )
107+     sentences  =  re .sub (rf "(\b[A-HJ-Z]\.){ separator }  " , r"\1 " , sentences )
85108
86109    # No break before coordinating conjunctions (CC) 
87110    coordinating_conjunctions  =  ["and" , "or" , "but" , "nor" , "yet" ]
88111    for  cc  in  coordinating_conjunctions :
89-         sentences  =  re .sub (r"\n("   +   cc   +   r"  )" , r" \1" , sentences )
112+         sentences  =  re .sub (rf" { separator } ( { cc } \s )" , r" \1" , sentences )
90113
91114    # No break before prepositions (IN) 
92115    prepositions  =  [
@@ -115,12 +138,12 @@ def postproc_splits(sentences):
115138        "whether" ,
116139    ]
117140    for  prep  in  prepositions :
118-         sentences  =  re .sub (r"\n("   +   prep   +   r"  )" , r" \1" , sentences )
141+         sentences  =  re .sub (rf" { separator } ( { prep } \s )" , r" \1" , sentences )
119142
120143    # No sentence breaks in the middle of specific abbreviations 
121-     sentences  =  re .sub (r "(\be\.)\n (g\.)" , r"\1 \2" , sentences )
122-     sentences  =  re .sub (r "(\bi\.)\n (e\.)" , r"\1 \2" , sentences )
123-     sentences  =  re .sub (r "(\bi\.)\n (v\.)" , r"\1 \2" , sentences )
144+     sentences  =  re .sub (rf "(\be\.){ separator }  (g\.)" , r"\1 \2" , sentences )
145+     sentences  =  re .sub (rf "(\bi\.){ separator }  (e\.)" , r"\1 \2" , sentences )
146+     sentences  =  re .sub (rf "(\bi\.){ separator }  (v\.)" , r"\1 \2" , sentences )
124147
125148    # No sentence break after specific abbreviations 
126149    abbreviations  =  [
@@ -169,13 +192,15 @@ def postproc_splits(sentences):
169192        r"i\. ?e\." ,
170193    ]
171194    for  abbr  in  abbreviations :
172-         sentences  =  re .sub (r"(\b"  +  abbr  +  r")\n" , r"\1 " , sentences )
195+         sentences  =  re .sub (
196+             rf"(\b{ abbr }  ){ separator }  " , r"\1" , sentences , flags = re .IGNORECASE 
197+         )
173198
174199    return  sentences 
175200
176201
def split_sentences(text, separator="abcdsentenceseperatordcba"):
    """Split text into sentences using rule-based heuristics.

    A marker string is inserted after every ``.``, ``!`` or ``?`` that is
    followed by whitespace or the end of the text. ``postproc_splits`` then
    removes markers that its heuristics judge to be false sentence
    boundaries, and the text is finally split on the remaining markers.

    Args:
        text: The text to split.
        separator: Marker string used internally to flag candidate sentence
            boundaries. It should not occur in ``text``.

    Returns:
        A list of sentence strings. NOTE(review): when the text ends with
        sentence-final punctuation the last element is presumably an empty
        string, since the trailing marker is also split on -- confirm
        against callers.
    """
    # Use a function replacement so backslashes or group references inside
    # the separator are inserted literally instead of being interpreted as
    # part of a replacement template.
    text = re.sub(
        r"([?!.])(?=\s|$)",
        lambda match: match.group(1) + separator,
        text,
    )
    text = postproc_splits(text, separator)
    # Escape the separator so regex metacharacters in it match literally;
    # optional surrounding newlines and one following space are consumed
    # together with the marker.
    return re.split(r"\n?" + re.escape(separator) + r" ?\n?", text)
0 commit comments