
Commit 6743df2

Added new version of split_sentence_str using new rule-based sentence tokenization
1 parent 416f290


guardrails/validator_base.py

Lines changed: 43 additions & 0 deletions
@@ -4,11 +4,13 @@
 # - [ ] Remove validator_base.py in 0.6.x

 import asyncio
+import contextlib
 from functools import partial
 import inspect
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+import re
 from string import Template
 from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
 from typing_extensions import deprecated
@@ -30,6 +32,7 @@
 from guardrails.types.on_fail import OnFailAction
 from guardrails.utils.safe_get import safe_get
 from guardrails.utils.hub_telemetry_utils import HubTelemetry
+from guardrails.utils.tokenization_utils import postproc_splits


 ### functions to get chunks ###
@@ -41,6 +44,46 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]


+def split_sentence_str_v2(chunk: str):
+    """
+    Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
+    We return the first sentence and the remaining chunk without the first sentence.
+
+    We perform the first step of WordTokenizers.jl's split_sentences function to
+    detect possible sentence boundaries before calling the sentence tokenizer.
+
+    Args:
+        chunk (str): The text to split into sentences.
+
+    Returns:
+        List[str]: A list of two strings. The first string is the first sentence
+            in the chunk. The second string is the remaining text in the chunk.
+    """
+    # the sentence tokenizer is expensive, so we check for sentence-ending
+    # punctuation first to avoid wastefully calling it
+
+    # check that at least 3 characters have been accumulated before splitting
+    is_minimum_length = False
+    with contextlib.suppress(IndexError):
+        chunk[2]
+        is_minimum_length = True
+
+    # check for potential sentence endings, which is what split_sentences does
+    chunk_with_potential_line_endings, count = re.subn(r"([?!.])\s", r"\1\n", chunk)
+    any_potential_line_endings = count > 0
+    if not is_minimum_length or not any_potential_line_endings:
+        return []
+
+    sentences = postproc_splits(chunk_with_potential_line_endings).split("\n")
+    # if there is not more than one sentence, we haven't accumulated enough to validate
+    if len(sentences) <= 1:
+        return []
+
+    # return the first sentence,
+    # then the remaining chunk that hasn't finished accumulating
+    return [sentences[0], "".join(sentences[1:])]
+
+
 # TODO ensure this is not indeed needed
 # def split_sentence_nltk(chunk: str):
 #     """
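To make the boundary pre-splitting step concrete, here is a minimal sketch of what the re.subn call in split_sentence_str_v2 does; the sample text is illustrative and not part of the commit:

    import re

    # insert a newline after every ?, !, or . that is followed by whitespace,
    # mirroring the first step of WordTokenizers.jl's split_sentences
    chunk = "It works. But is it fast? We will see"
    pre_split, count = re.subn(r"([?!.])\s", r"\1\n", chunk)

    print(count)           # 2 -- two potential sentence boundaries were found
    print(repr(pre_split)) # 'It works.\nBut is it fast?\nWe will see'

Because count is zero for a chunk with no boundary punctuation, the function returns early and the expensive tokenizer call is skipped entirely while text is still streaming in.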

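A usage sketch of the new helper follows; the chunks are hypothetical, and the expected outputs assume postproc_splits preserves the newline inserted at a true sentence boundary:

    from guardrails.validator_base import split_sentence_str_v2

    # too short / no boundary punctuation yet: nothing to validate
    print(split_sentence_str_v2("The quick brown"))   # []

    # one finished sentence plus an unfinished remainder: the first sentence
    # is returned for validation and the tail keeps accumulating
    print(split_sentence_str_v2("It works. But is it fa"))
    # expected: ['It works.', 'But is it fa']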