Skip to content

Commit affb205

Browse files
removed older scripts
1 parent 49aa02d commit affb205

File tree

3 files changed

+58
-303
lines changed

3 files changed

+58
-303
lines changed

guardrails/utils/tokenization_utils.py

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@
88

99

1010
def replace_til_no_change(input_text, pattern, replacement):
    """Repeatedly apply ``re.sub(pattern, replacement)`` until a fixed point.

    Each pass substitutes every match; the loop stops as soon as a pass
    leaves the text unchanged, and that stable text is returned.
    """
    previous = None
    while input_text != previous:
        previous = input_text
        input_text = re.sub(pattern, replacement, input_text)
    return input_text
1417

1518

16-
def postproc_splits(sentences):
19+
def postproc_splits(sentences, separator):
1720
"""
1821
Applies heuristic rules to repair sentence splitting errors.
1922
Developed for use as postprocessing for the GENIA sentence
@@ -30,63 +33,83 @@ def postproc_splits(sentences):
3033
Which draws in part on heuristics included in Yoshimasa Tsuruoka's
3134
medss.pl script.
3235
"""
36+
3337
# Remove Windows line endings
3438
sentences = sentences.replace("\r", "")
3539

3640
# Breaks sometimes missing after "?", "safe" cases
37-
sentences = re.sub(r"\b([a-z]+\?) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
38-
# Breaks sometimes missing after "." separated with extra space, "safe" cases
39-
sentences = re.sub(r"\b([a-z]+ \.) ([A-Z][a-z]+)\b", r"\1\n\2", sentences)
41+
sentences = re.sub(
42+
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
43+
)
44+
# Breaks sometimes missing after ".", "safe" cases
45+
sentences = re.sub(
46+
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
47+
)
4048

4149
# No breaks producing lines only containing sentence-ending punctuation
42-
sentences = re.sub(r"\n([.!?]+)\n", r"\1\n", sentences)
50+
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
4351

4452
# No breaks inside parentheses/brackets
45-
# Unlimited length for no intervening parentheses/brackets
4653
sentences = replace_til_no_change(
47-
sentences, r"\[([^\[\]\(\)]*)\n([^\[\]\(\)]*)\]", r"[\1 \2]"
54+
sentences,
55+
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
56+
r"[\1 \2]",
4857
)
4958
sentences = replace_til_no_change(
50-
sentences, r"\(([^\[\]\(\)]*)\n([^\[\]\(\)]*)\)", r"(\1 \2)"
59+
sentences,
60+
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
61+
r"(\1 \2)",
5162
)
5263
# Standard mismatched with possible intervening
5364
sentences = replace_til_no_change(
54-
sentences, r"\[([^\[\]]{0,250})\n([^\[\]]{0,250})\]", r"[\1 \2]"
65+
sentences,
66+
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
67+
r"[\1 \2]",
5568
)
5669
sentences = replace_til_no_change(
57-
sentences, r"\(([^\(\)]{0,250})\n([^\(\)]{0,250})\)", r"(\1 \2)"
70+
sentences,
71+
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
72+
r"(\1 \2)",
5873
)
5974

60-
# Guardrails mods for line breaks within quotes
75+
# Line breaks within quotes
6176
sentences = replace_til_no_change(
62-
sentences, r'"([^"\n]{0,250})\n([^"\n]{0,250})"', r'"\1 \2"'
77+
sentences,
78+
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
79+
r'"\1 \2"',
6380
)
6481
sentences = replace_til_no_change(
65-
sentences, r"'([^'\n]{0,250})\n([^'\n]{0,250})'", r"'\1 \2'"
82+
sentences,
83+
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
84+
r"'\1 \2'",
6685
)
6786

6887
# Nesting to depth one
6988
sentences = replace_til_no_change(
7089
sentences,
71-
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
90+
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
91+
+ re.escape(separator)
92+
+ r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
7293
r"[\1 \2]",
7394
)
7495
sentences = replace_til_no_change(
7596
sentences,
76-
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
97+
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
98+
+ re.escape(separator)
99+
+ r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
77100
r"(\1 \2)",
78101
)
79102

80103
# No break after periods followed by a non-uppercase "normal word"
81-
sentences = re.sub(r"\.\n([a-z]{3}[a-z-]*[ .:,])", r". \1", sentences)
104+
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
82105

83106
# No break after a single letter other than I
84-
sentences = re.sub(r"(\b[A-HJ-Z]\.)\n", r"\1 ", sentences)
107+
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
85108

86109
# No break before coordinating conjunctions (CC)
87110
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
88111
for cc in coordinating_conjunctions:
89-
sentences = re.sub(r"\n(" + cc + r" )", r" \1", sentences)
112+
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
90113

91114
# No break before prepositions (IN)
92115
prepositions = [
@@ -115,12 +138,12 @@ def postproc_splits(sentences):
115138
"whether",
116139
]
117140
for prep in prepositions:
118-
sentences = re.sub(r"\n(" + prep + r" )", r" \1", sentences)
141+
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
119142

120143
# No sentence breaks in the middle of specific abbreviations
121-
sentences = re.sub(r"(\be\.)\n(g\.)", r"\1 \2", sentences)
122-
sentences = re.sub(r"(\bi\.)\n(e\.)", r"\1 \2", sentences)
123-
sentences = re.sub(r"(\bi\.)\n(v\.)", r"\1 \2", sentences)
144+
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
145+
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
146+
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
124147

125148
# No sentence break after specific abbreviations
126149
abbreviations = [
@@ -169,13 +192,15 @@ def postproc_splits(sentences):
169192
r"i\. ?e\.",
170193
]
171194
for abbr in abbreviations:
172-
sentences = re.sub(r"(\b" + abbr + r")\n", r"\1 ", sentences)
195+
sentences = re.sub(
196+
rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
197+
)
173198

174199
return sentences
175200

176201

177-
def split_sentences(text, separator="abcdsentenceseperatordcba"):
    """Split ``text`` into sentences using rule-based heuristics.

    A sentinel ``separator`` is inserted after sentence-ending punctuation
    (``?``, ``!``, ``.``) that is followed by whitespace or end-of-string,
    heuristic repairs are applied by ``postproc_splits``, and the text is
    then split on the sentinel.

    Args:
        text: The input text to split into sentences.
        separator: Sentinel marker assumed not to occur naturally in
            ``text``. Defaults to an unlikely letter sequence.

    Returns:
        A list of sentence strings.
    """
    # Use a callable replacement so a caller-supplied separator containing
    # backslashes or group references (e.g. r"\1") is inserted literally.
    text = re.sub(r"([?!.])(?=\s|$)", lambda m: m.group(1) + separator, text)
    text = postproc_splits(text, separator)
    # Escape the sentinel so regex metacharacters in a custom separator are
    # treated literally — consistent with postproc_splits, which already
    # wraps the separator in re.escape() internally.
    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)

guardrails/utils/tokenization_utils_seperator.py

Lines changed: 0 additions & 206 deletions
This file was deleted.

0 commit comments

Comments (0)