
Commit 13bf2bd

Add _string to CharTokenizer (nltk#3156)
1 parent fc53edb

File tree: 2 files changed (+21, -0 lines)


nltk/test/unit/test_tokenize.py

Lines changed: 19 additions & 0 deletions
@@ -16,6 +16,7 @@
     sent_tokenize,
     word_tokenize,
 )
+from nltk.tokenize.simple import CharTokenizer


 def load_stanford_segmenter():
@@ -865,3 +866,21 @@ class ExtLangVars(punkt.PunktLanguageVars):
         )
     def test_sent_tokenize(self, sentences: str, expected: List[str]):
         assert sent_tokenize(sentences) == expected
+
+    def test_string_tokenizer(self) -> None:
+        sentence = "Hello there"
+        tokenizer = CharTokenizer()
+        assert tokenizer.tokenize(sentence) == list(sentence)
+        assert list(tokenizer.span_tokenize(sentence)) == [
+            (0, 1),
+            (1, 2),
+            (2, 3),
+            (3, 4),
+            (4, 5),
+            (5, 6),
+            (6, 7),
+            (7, 8),
+            (8, 9),
+            (9, 10),
+            (10, 11),
+        ]
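
For reference, the expected spans are just consecutive one-character offsets, so for a string of length n they run from (0, 1) to (n-1, n). A quick sketch (not part of the commit) that derives the asserted list:

    sentence = "Hello there"  # 11 characters
    spans = [(i, i + 1) for i in range(len(sentence))]
    # [(0, 1), (1, 2), ..., (10, 11)], matching the list asserted above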

nltk/tokenize/simple.py

Lines changed: 2 additions & 0 deletions
@@ -70,6 +70,8 @@ class CharTokenizer(StringTokenizer):
     is ever required directly, use ``for char in string``.
     """

+    _string = None
+
     def tokenize(self, s):
         return list(s)
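
The likely motivation, inferred from the diff: the parent class StringTokenizer expects a ``_string`` attribute (its default splitting behavior is built around it), so a CharTokenizer without one presumably could not be instantiated cleanly; setting ``_string = None`` makes the subclass concrete while its own ``tokenize`` ignores the attribute. A minimal usage sketch, assuming an NLTK checkout with this commit applied:

    from nltk.tokenize.simple import CharTokenizer

    tokenizer = CharTokenizer()                  # constructs without error after this change
    print(tokenizer.tokenize("Hi!"))             # ['H', 'i', '!']
    print(list(tokenizer.span_tokenize("Hi!")))  # [(0, 1), (1, 2), (2, 3)]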
