
Commit 6462558

added back modified seperator algo, fix for split sentence
1 parent 88477c7 commit 6462558

File tree

2 files changed: +257 additions, -2 deletions
guardrails/utils/tokenization_utils_seperator.py (new file)

Lines changed: 205 additions & 0 deletions

@@ -0,0 +1,205 @@
# This file contains code adapted from the WordTokenizers.jl
# https://github.com/JuliaText/WordTokenizers.jl project.
# It is subject to the license terms in the Apache License file
# found in the top-level directory of this distribution.
# This file has been modified by Guardrails AI on September 27 2024.

import re


def replace_til_no_change(input_text, pattern, replacement):
    while True:
        new_text = re.sub(pattern, replacement, input_text)
        if new_text == input_text:
            break
        input_text = new_text
    return input_text


def postproc_splits(sentences, separator):
    """
    Applies heuristic rules to repair sentence splitting errors.
    Developed for use as postprocessing for the GENIA sentence
    splitter on PubMed abstracts, with minor tweaks for
    full-text documents.

    `sentences` should be a string, with line breaks on sentence boundaries.
    Returns a similar string, but more correct.

    Based on
    https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
    Which is
    (c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
    Which draws in part on heuristics included in Yoshimasa Tsuruoka's
    medss.pl script.
    """

    # Remove Windows line endings
    sentences = sentences.replace("\r", "")

    # Breaks sometimes missing after "?", "safe" cases
    sentences = re.sub(
        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
    )
    # Breaks sometimes missing after ".", "safe" cases
    sentences = re.sub(
        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
    )

    # No breaks producing lines only containing sentence-ending punctuation
    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)

    # No breaks inside parentheses/brackets
    sentences = replace_til_no_change(
        sentences,
        r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
        r"[\1 \2]",
    )
    sentences = replace_til_no_change(
        sentences,
        r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
        r"(\1 \2)",
    )
    # Standard mismatched with possible intervening
    sentences = replace_til_no_change(
        sentences,
        r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
        r"[\1 \2]",
    )
    sentences = replace_til_no_change(
        sentences,
        r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
        r"(\1 \2)",
    )

    # Line breaks within quotes
    sentences = replace_til_no_change(
        sentences,
        r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
        r'"\1 \2"',
    )
    sentences = replace_til_no_change(
        sentences,
        r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
        r"'\1 \2'",
    )

    # Nesting to depth one
    sentences = replace_til_no_change(
        sentences,
        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
        + re.escape(separator)
        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
        r"[\1 \2]",
    )
    sentences = replace_til_no_change(
        sentences,
        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
        + re.escape(separator)
        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
        r"(\1 \2)",
    )

    # No break after periods followed by a non-uppercase "normal word"
    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

    # No break after a single letter other than I
    sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)

    # No break before coordinating conjunctions (CC)
    coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
    for cc in coordinating_conjunctions:
        sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)

    # No break before prepositions (IN)
    prepositions = [
        "of",
        "in",
        "by",
        "as",
        "on",
        "at",
        "to",
        "via",
        "for",
        "with",
        "that",
        "than",
        "from",
        "into",
        "upon",
        "after",
        "while",
        "during",
        "within",
        "through",
        "between",
        "whereas",
        "whether",
    ]
    for prep in prepositions:
        sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)

    # No sentence breaks in the middle of specific abbreviations
    sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
    sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
    sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)

    # No sentence break after specific abbreviations
    abbreviations = [
        r"e\.?g\.",
        r"i\.?e\.",
        r"i\.?v\.",
        r"vs\.",
        r"cf\.",
        r"Dr\.",
        r"Mr\.",
        r"Ms\.",
        r"Mrs\.",
        r"Prof\.",
        r"Ph\.?D\.",
        r"Jr\.",
        r"St\.",
        r"Mt\.",
        r"etc\.",
        r"Fig\.",
        r"vol\.",
        r"Vols\.",
        r"No\.",
        r"Nos\.",
        r"et\.",
        r"al\.",
        r"Inc\.",
        r"Ltd\.",
        r"Co\.",
        r"Corp\.",
        r"Dept\.",
        r"est\.",
        r"Asst\.",
        r"approx\.",
        r"dr\.",
        r"fig\.",
        r"mr\.",
        r"mrs\.",
        r"ms\.",
        r"prof\.",
        r"rep\.",
        r"jr\.",
        r"sen\.",
        r"st\.",
        r"vs\.",
        r"i\.?e\.",
    ]
    for abbr in abbreviations:
        sentences = re.sub(
            rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
        )

    return sentences


def split_sentences(text, separator="abcdsentenceseperatordcba"):
    # Use the separator in the regex
    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
    text = postproc_splits(text, separator)
    return re.split(rf"\n?{separator} ?\n?", text)
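
For context, a minimal usage sketch of the new module (illustrative only; the module path is inferred from the import added to validator_base.py below):

# Sketch, not part of the commit; path assumed from the new import below.
from guardrails.utils.tokenization_utils_seperator import split_sentences

text = "Samples were stored at 4 C, i.e. refrigerated. They were analyzed later."
print(split_sentences(text))
# Expected (roughly):
# ['Samples were stored at 4 C, i.e. refrigerated.', 'They were analyzed later.', '']
# The "i.e." stays inside the first sentence thanks to the abbreviation
# heuristics in postproc_splits; a trailing empty string appears because the
# text ends with terminal punctuation.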

guardrails/validator_base.py

Lines changed: 52 additions & 2 deletions
@@ -33,6 +33,9 @@
 from guardrails.utils.safe_get import safe_get
 from guardrails.utils.hub_telemetry_utils import HubTelemetry
 from guardrails.utils.tokenization_utils import postproc_splits
+from guardrails.utils.tokenization_utils_seperator import (
+    postproc_splits as postproc_splits_separator,
+)


 ### functions to get chunks ###
@@ -44,6 +47,51 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]


+def split_sentence_word_tokenizers_jl_separator(
+    chunk: str, separator: str = "abcdsentenceseperatordcba"
+):
+    """
+    Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
+    We return the first sentence and the remaining chunks without the first sentence.
+
+    We perform the first step of WordTokenizers.jl's split_sentences function to
+    detect possible sentence boundaries before calling the sentence tokenizer.
+
+    Args:
+        chunk (str): The text to split into sentences.
+
+    Returns:
+        List[str]: A list of two strings. The first string is the first sentence
+            in the chunk. The second string is the remaining text in the chunk.
+    """
+    # using the sentence tokenizer is expensive
+    # we check for a . to avoid wastefully calling the tokenizer
+
+    # check at least 3 characters have been accumulated before splitting
+    is_minimum_length = False
+    with contextlib.suppress(IndexError):
+        chunk[2]
+        is_minimum_length = True
+
+    # check for potential line endings, which is what split_sentences does
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])\s?", rf"\1{separator}", chunk
+    )
+    any_potential_line_endings = count > 0
+    if not is_minimum_length or not any_potential_line_endings:
+        return []
+
+    sentences = postproc_splits_separator(chunk_with_potential_line_endings, separator)
+    sentences = re.split(rf"\n?{separator} ?\n?", sentences)
+    # if not more than one sentence, we haven't accumulated enough for a validation
+    if len(sentences) <= 1:
+        return []
+
+    # return the sentence
+    # then the remaining chunks that aren't finished accumulating
+    return [sentences[0], "".join(sentences[1:])]
+
+
 def split_sentence_word_tokenizers_jl(chunk: str):
     """
     Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
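
A sketch of how the new helper behaves on streamed text (illustrative; the function is added to guardrails/validator_base.py by this commit):

# Sketch only.
from guardrails.validator_base import split_sentence_word_tokenizers_jl_separator

# No sentence-ending punctuation yet: nothing to validate, so an empty list.
split_sentence_word_tokenizers_jl_separator("Still accumulating this chun")
# []

# At least one complete sentence: first sentence, then the unfinished remainder.
split_sentence_word_tokenizers_jl_separator("Hello world. This is a test.")
# expected roughly: ['Hello world.', 'This is a test.']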
@@ -69,7 +117,9 @@ def split_sentence_word_tokenizers_jl(chunk):
         is_minimum_length = True

     # check for potential line endings, which is what split_sentences does
-    chunk_with_potential_line_endings, count = re.subn(r"([?!.])(\s)?", r"\1\n", chunk)
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])(?=\s|$)", r"\1\n", chunk
+    )
     any_potential_line_endings = count > 0
     if not is_minimum_length or not any_potential_line_endings:
         return []
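
This regex change is presumably the "fix for split sentence" from the commit message: the old pattern matched any ".", "!" or "?" (even mid-token) and consumed the following whitespace, while the new lookahead only fires before whitespace or end of string and leaves the whitespace in place. A small sketch of the difference:

# Sketch only.
import re

chunk = "approx. 3.5 mg was given. More follows."

# old pattern: also splits inside "3.5" and swallows the space after "."
re.subn(r"([?!.])(\s)?", r"\1\n", chunk)
# ('approx.\n3.\n5 mg was given.\nMore follows.\n', 4)

# new pattern: only terminal punctuation, whitespace preserved
re.subn(r"([?!.])(?=\s|$)", r"\1\n", chunk)
# ('approx.\n 3.5 mg was given.\n More follows.\n', 3)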
@@ -303,7 +353,7 @@ def _chunking_function(self, chunk: str) -> List[str]:
         Returns:
             list[str]: The text chunked into some subset.
         """
-        return split_sentence_word_tokenizers_jl(chunk)
+        return split_sentence_word_tokenizers_jl_separator(chunk)

     def validate_stream(
         self, chunk: Any, metadata: Dict[str, Any], **kwargs
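
The commit does not state why a sentinel separator token is used instead of the newline marker in split_sentence_word_tokenizers_jl, but one observable effect of the new splitter is that literal newlines already present in a chunk survive chunking (illustrative sketch):

# Sketch only.
from guardrails.validator_base import split_sentence_word_tokenizers_jl_separator

split_sentence_word_tokenizers_jl_separator("A result\nwas found. Next sentence.")
# expected roughly: ['A result\nwas found.', 'Next sentence.']
# the embedded "\n" stays inside the first sentence rather than being
# treated as a sentence boundary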
