File tree Expand file tree Collapse file tree 2 files changed +21
-0
lines changed Expand file tree Collapse file tree 2 files changed +21
-0
lines changed Original file line number Diff line number Diff line change 1616 sent_tokenize ,
1717 word_tokenize ,
1818)
19+ from nltk .tokenize .simple import CharTokenizer
1920
2021
2122def load_stanford_segmenter ():
@@ -865,3 +866,21 @@ class ExtLangVars(punkt.PunktLanguageVars):
865866 )
866867 def test_sent_tokenize (self , sentences : str , expected : List [str ]):
867868 assert sent_tokenize (sentences ) == expected
869+
def test_string_tokenizer(self) -> None:
    """Check CharTokenizer on a short sentence.

    The tokenizer is expected to return one token per character and,
    from ``span_tokenize``, one ``(start, end)`` pair of width one per
    character, in order.
    """
    text = "Hello there"
    char_tokenizer = CharTokenizer()

    # Every character, the space included, is expected as its own token.
    tokens = char_tokenizer.tokenize(text)
    assert tokens == list(text)

    # Expected spans are (0, 1), (1, 2), ..., (10, 11) for the 11 characters.
    expected_spans = [(start, start + 1) for start in range(len(text))]
    assert list(char_tokenizer.span_tokenize(text)) == expected_spans
Original file line number Diff line number Diff line change @@ -70,6 +70,8 @@ class CharTokenizer(StringTokenizer):
7070 is ever required directly, use ``for char in string``.
7171 """
7272
73+ _string = None
74+
def tokenize(self, s):
    """Return the elements of ``s`` as a list, one entry per character."""
    # Unpacking into a list literal iterates ``s`` exactly like ``list(s)``.
    return [*s]
7577
You can’t perform that action at this time.
0 commit comments