
Commit 0298730

remove nltk import and download from validator base

1 parent bbef78a · commit 0298730

1 file changed (+19, -25)


guardrails/validator_base.py

Lines changed: 19 additions & 25 deletions
@@ -15,7 +15,6 @@
 from warnings import warn
 import warnings
 
-import nltk
 import requests
 from langchain_core.runnables import Runnable
 
@@ -32,12 +31,6 @@
 from guardrails.utils.safe_get import safe_get
 from guardrails.utils.hub_telemetry_utils import HubTelemetry
 
-# See: https://github.com/guardrails-ai/guardrails/issues/829
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")
-
 
 ### functions to get chunks ###
 def split_sentence_str(chunk: str):
@@ -48,24 +41,25 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]
 
 
-def split_sentence_nltk(chunk: str):
-    """
-    NOTE: this approach currently does not work
-    Use a sentence tokenizer to split the chunk into sentences.
-
-    Because using the tokenizer is expensive, we only use it if there
-    is a period present in the chunk.
-    """
-    # using the sentence tokenizer is expensive
-    # we check for a . to avoid wastefully calling the tokenizer
-    if "." not in chunk:
-        return []
-    sentences = nltk.sent_tokenize(chunk)
-    if len(sentences) == 0:
-        return []
-    # return the sentence
-    # then the remaining chunks that aren't finished accumulating
-    return [sentences[0], "".join(sentences[1:])]
+# TODO ensure this is not indeed needed
+# def split_sentence_nltk(chunk: str):
+#     """
+#     NOTE: this approach currently does not work
+#     Use a sentence tokenizer to split the chunk into sentences.
+#
+#     Because using the tokenizer is expensive, we only use it if there
+#     is a period present in the chunk.
+#     """
+#     # using the sentence tokenizer is expensive
+#     # we check for a . to avoid wastefully calling the tokenizer
+#     if "." not in chunk:
+#         return []
+#     sentences = nltk.sent_tokenize(chunk)
+#     if len(sentences) == 0:
+#         return []
+#     # return the sentence
+#     # then the remaining chunks that aren't finished accumulating
+#     return [sentences[0], "".join(sentences[1:])]
 
 
 # TODO: Can we remove dataclass? It was originally added to support pydantic 1.*
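
For context on the removal: the deleted block ran nltk.download("punkt") at module import time, which the issue linked in the diff (guardrails-ai/guardrails#829) flags as a problem. Below is a minimal sketch, not part of this commit, of how the commented-out splitter could be kept without the module-level import by deferring nltk to first use; the body mirrors the removed split_sentence_nltk and the deferred-import pattern is an assumption, not the project's chosen fix.

# Sketch only (not in this commit): defer the nltk import and the punkt
# download to the first call, so importing validator_base stays offline.
def split_sentence_nltk(chunk: str):
    # the tokenizer is expensive; skip it when no period is present
    if "." not in chunk:
        return []

    import nltk  # deferred import: nltk becomes an optional dependency

    # fetch punkt once, on first use rather than at module import
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

    sentences = nltk.sent_tokenize(chunk)
    if len(sentences) == 0:
        return []
    # return the first complete sentence, then the remainder
    # that hasn't finished accumulating
    return [sentences[0], "".join(sentences[1:])]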
