1515from warnings import warn
1616import warnings
1717
18- import nltk
1918import requests
2019from langchain_core .runnables import Runnable
2120
3231from guardrails .utils .safe_get import safe_get
3332from guardrails .utils .hub_telemetry_utils import HubTelemetry
3433
35- # See: https://github.com/guardrails-ai/guardrails/issues/829
36- try :
37- nltk .data .find ("tokenizers/punkt" )
38- except LookupError :
39- nltk .download ("punkt" )
40-
4134
4235### functions to get chunks ###
4336def split_sentence_str (chunk : str ):
@@ -48,24 +41,25 @@ def split_sentence_str(chunk: str):
4841 return [fragments [0 ] + "." , "." .join (fragments [1 :])]
4942
5043
51- def split_sentence_nltk (chunk : str ):
52- """
53- NOTE: this approach currently does not work
54- Use a sentence tokenizer to split the chunk into sentences.
55-
56- Because using the tokenizer is expensive, we only use it if there
57- is a period present in the chunk.
58- """
59- # using the sentence tokenizer is expensive
60- # we check for a . to avoid wastefully calling the tokenizer
61- if "." not in chunk :
62- return []
63- sentences = nltk .sent_tokenize (chunk )
64- if len (sentences ) == 0 :
65- return []
66- # return the sentence
67- # then the remaining chunks that aren't finished accumulating
68- return [sentences [0 ], "" .join (sentences [1 :])]
44+ # TODO: confirm this is indeed not needed before removing it permanently
45+ # def split_sentence_nltk(chunk: str):
46+ # """
47+ # NOTE: this approach currently does not work
48+ # Use a sentence tokenizer to split the chunk into sentences.
49+
50+ # Because using the tokenizer is expensive, we only use it if there
51+ # is a period present in the chunk.
52+ # """
53+ # # using the sentence tokenizer is expensive
54+ # # we check for a . to avoid wastefully calling the tokenizer
55+ # if "." not in chunk:
56+ # return []
57+ # sentences = nltk.sent_tokenize(chunk)
58+ # if len(sentences) == 0:
59+ # return []
60+ # # return the sentence
61+ # # then the remaining chunks that aren't finished accumulating
62+ # return [sentences[0], "".join(sentences[1:])]
6963
7064
7165# TODO: Can we remove dataclass? It was originally added to support pydantic 1.*
0 commit comments