#   - [ ] Remove validator_base.py in 0.6.x

import asyncio
import contextlib
from functools import partial
import inspect
import logging
from collections import defaultdict
from dataclasses import dataclass
import re
from string import Template
from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
from typing_extensions import deprecated
from guardrails.types.on_fail import OnFailAction
from guardrails.utils.safe_get import safe_get
from guardrails.utils.hub_telemetry_utils import HubTelemetry
from guardrails.utils.tokenization_utils import postproc_splits


### functions to get chunks ###
@@ -41,6 +44,46 @@ def split_sentence_str(chunk: str):
    return [fragments[0] + ".", ".".join(fragments[1:])]

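# Illustrative example (assumes `fragments = chunk.split(".")`, which is not shown
# in this hunk): split_sentence_str is the naive period-based splitter, so
#     split_sentence_str("Mr. Smith went home.")
# comes back as ["Mr.", " Smith went home."], cutting at the abbreviation.
# split_sentence_str_v2 below defers to a sentence tokenizer to avoid this.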
def split_sentence_str_v2(chunk: str):
    """
    Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
    We return the first sentence and the remaining text without the first sentence.

    We perform the first step of WordTokenizers.jl's split_sentences function to
    detect possible sentence boundaries before calling the sentence tokenizer.

    Args:
        chunk (str): The text to split into sentences.

    Returns:
        List[str]: A list of two strings. The first string is the first sentence
            in the chunk. The second string is the remaining text in the chunk.
    """
    # using the sentence tokenizer is expensive
    # we check for sentence-ending punctuation to avoid wastefully calling the tokenizer

    # check that at least 3 characters have been accumulated before splitting
    is_minimum_length = False
    with contextlib.suppress(IndexError):
        chunk[2]
        is_minimum_length = True

    # check for potential line endings, which is what split_sentences does
    chunk_with_potential_line_endings, count = re.subn(r"([?!.])\s", r"\1\n", chunk)
    any_potential_line_endings = count > 0
    if not is_minimum_length or not any_potential_line_endings:
        return []

    sentences = postproc_splits(chunk_with_potential_line_endings).split("\n")
    # if there is not more than one sentence, we haven't accumulated enough for a validation
    if len(sentences) <= 1:
        return []

    # return the first sentence,
    # then the remaining chunks that aren't finished accumulating
    return [sentences[0], "".join(sentences[1:])]
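# Illustrative sketch (not part of this change) of how a streaming accumulator is
# expected to call split_sentence_str_v2; the `buffer` and `stream` names below are
# hypothetical, not guardrails API:
#
#     buffer = ""
#     for token in stream:             # tokens arriving from the LLM
#         buffer += token
#         split = split_sentence_str_v2(buffer)
#         if split:                    # a full sentence was detected
#             sentence, buffer = split # validate `sentence`, keep accumulating the rest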


# TODO ensure this is indeed not needed
# def split_sentence_nltk(chunk: str):
#     """