Skip to content

Commit 6f8b175

Browse files
Fix regex patterns for abbreviations in tokenization_utils_seperator.py
1 parent 6462558 commit 6f8b175

File tree

3 files changed

+9
-9
lines changed

3 files changed

+9
-9
lines changed

guardrails/utils/tokenization_utils_seperator.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -147,9 +147,9 @@ def postproc_splits(sentences, separator):
147147

148148
# No sentence break after specific abbreviations
149149
abbreviations = [
150-
r"e\.?g\.",
151-
r"i\.?e\.",
152-
r"i\.?v\.",
150+
r"e\. ?g\.",
151+
r"i\. ?e\.",
152+
r"i\. ?v\.",
153153
r"vs\.",
154154
r"cf\.",
155155
r"Dr\.",
@@ -165,11 +165,12 @@ def postproc_splits(sentences, separator):
165165
r"Fig\.",
166166
r"vol\.",
167167
r"Vols\.",
168-
r"No\.",
168+
r"no\.",
169169
r"Nos\.",
170170
r"et\.",
171171
r"al\.",
172-
r"Inc\.",
172+
r"i\. ?v\.",
173+
r"inc\.",
173174
r"Ltd\.",
174175
r"Co\.",
175176
r"Corp\.",
@@ -188,7 +189,7 @@ def postproc_splits(sentences, separator):
188189
r"sen\.",
189190
r"st\.",
190191
r"vs\.",
191-
r"i\.?e\.",
192+
r"i\. ?e\.",
192193
]
193194
for abbr in abbreviations:
194195
sentences = re.sub(

guardrails/validator_base.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,7 @@ def split_sentence_word_tokenizers_jl_separator(
7575

7676
# check for potential line endings, which is what split_sentences does
7777
chunk_with_potential_line_endings, count = re.subn(
78-
r"([?!.])\s?", rf"\1{separator}", chunk
78+
r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
7979
)
8080
any_potential_line_endings = count > 0
8181
if not is_minimum_length or not any_potential_line_endings:

tests/integration_tests/test_streaming.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -584,8 +584,7 @@ def test_fix_behavior_three_validators(mocker):
584584
assert (
585585
text
586586
== """"REDACTED!!, under purple!! bridges, roams,
587-
<LOCATION> hills, his home.
588-
dreams of fog, and salty air,
587+
<LOCATION> hills, his home.dreams of fog, and salty air,
589588
in his heart, he's always there."""
590589
)
591590
assert (

0 commit comments

Comments
 (0)