11"""Test semchunk."""
22import semchunk
33
4+ import tiktoken
5+
46from helpers import GUTENBERG , initialize_test_token_counters
7+ from transformers import AutoTokenizer
58
69TEST_TOKEN_COUNTERS = (
7- 'emubert_transformers' ,
8- 'gpt4_tiktoken' ,
9- 'word' ,
10+ # 'emubert_transformers',
11+ # 'gpt4_tiktoken',
12+ # 'word',
1013 'char' ,
1114)
1215TEST_CHUNK_SIZES = (
@@ -153,7 +156,7 @@ def test_semchunk() -> None:
153156
154157 # Test using `tiktoken` tokenizers, encodings and a `transformers` tokenizer by name with `chunkerify()`.
155158 for name in ['cl100k_base' , 'gpt-4' , 'umarbutler/emubert' ]:
156- chunker = semchunk .chunkerify ('gpt-4' , 1 )
159+ chunker = semchunk .chunkerify (name , 1 )
157160 chunker (DETERMINISTIC_TEST_INPUT )
158161 if TEST_OFFSETS : chunker (DETERMINISTIC_TEST_INPUT , offsets = True )
159162
@@ -167,6 +170,14 @@ def test_semchunk() -> None:
167170
168171 assert error_raised
169172
173+ # Test using a `transformers` tokenizer directly.
174+ tokenizer = AutoTokenizer .from_pretrained ('umarbutler/emubert' )
175+ chunker = semchunk .chunkerify (tokenizer , 1 )
176+
177+ # Test using a `tiktoken` tokenizer directly.
178+ tokenizer = tiktoken .encoding_for_model ('gpt-4' )
179+ chunker = semchunk .chunkerify (tokenizer , 1 )
180+
170181 # Try enabling a progress bar.
171182 chunker ([DETERMINISTIC_TEST_INPUT , DETERMINISTIC_TEST_INPUT ], progress = True )
172183 chunker ([DETERMINISTIC_TEST_INPUT , DETERMINISTIC_TEST_INPUT ], offsets = True , progress = True )
0 commit comments