Skip to content

Commit fb1d2d0

Browse files
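Reverted the sentence-splitting change in the validator base module (removed the `separator` parameter from `split_sentence_word_tokenizers_jl`)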
1 parent 4ebef18 commit fb1d2d0

File tree

1 file changed

+3
-9
lines changed

1 file changed

+3
-9
lines changed

guardrails/validator_base.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,7 @@ def split_sentence_str(chunk: str):
44 44
return [fragments[0] + ".", ".".join(fragments[1:])]
45 45

46 46

47-
def split_sentence_word_tokenizers_jl(
48-
chunk: str, separator: str = "SENTENCEBREAK"
49-
) -> List[str]:
47+
def split_sentence_word_tokenizers_jl(chunk: str):
50 48
"""
51 49
Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
52 50
We return the first sentence and the remaining chunks without the first sentence.
@@ -71,16 +69,12 @@ def split_sentence_word_tokenizers_jl(
71 69
is_minimum_length = True
72 70

73 71
# check for potential line endings, which is what split_sentences does
74-
chunk_with_potential_line_endings, count = re.subn(
75-
r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
76-
)
72+
chunk_with_potential_line_endings, count = re.subn(r"([?!.])\s", r"\1\n", chunk)
77 73
any_potential_line_endings = count > 0
78 74
if not is_minimum_length or not any_potential_line_endings:
79 75
return []
80 76

81-
sentences = postproc_splits(chunk_with_potential_line_endings, separator).split(
82-
"\n"
83-
)
77+
sentences = postproc_splits(chunk_with_potential_line_endings).split("\n")
8478
# if not more than one sentence, we haven't accumulated enough for a validation
8579
if len(sentences) <= 1:
8680
return []

0 commit comments

Comments
 (0)