Skip to content

Commit cc4f365

Browse files
committed
Sped up tests
1 parent dd6c44e commit cc4f365

File tree

1 file changed: +15 additions, -4 deletions

tests/test_semchunk.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
"""Test semchunk."""
22
import semchunk
33

4+
import tiktoken
5+
46
from helpers import GUTENBERG, initialize_test_token_counters
7+
from transformers import AutoTokenizer
58

69
TEST_TOKEN_COUNTERS = (
7-
'emubert_transformers',
8-
'gpt4_tiktoken',
9-
'word',
10+
# 'emubert_transformers',
11+
# 'gpt4_tiktoken',
12+
# 'word',
1013
'char',
1114
)
1215
TEST_CHUNK_SIZES = (
@@ -153,7 +156,7 @@ def test_semchunk() -> None:
153156

154157
# Test using `tiktoken` tokenizers, encodings and a `transformers` tokenizer by name with `chunkerify()`.
155158
for name in ['cl100k_base', 'gpt-4', 'umarbutler/emubert']:
156-
chunker = semchunk.chunkerify('gpt-4', 1)
159+
chunker = semchunk.chunkerify(name, 1)
157160
chunker(DETERMINISTIC_TEST_INPUT)
158161
if TEST_OFFSETS: chunker(DETERMINISTIC_TEST_INPUT, offsets = True)
159162

@@ -167,6 +170,14 @@ def test_semchunk() -> None:
167170

168171
assert error_raised
169172

173+
# Test using a `transformers` tokenizer directly.
174+
tokenizer = AutoTokenizer.from_pretrained('umarbutler/emubert')
175+
chunker = semchunk.chunkerify(tokenizer, 1)
176+
177+
# Test using a `tiktoken` tokenizer directly.
178+
tokenizer = tiktoken.encoding_for_model('gpt-4')
179+
chunker = semchunk.chunkerify(tokenizer, 1)
180+
170181
# Try enabling a progress bar.
171182
chunker([DETERMINISTIC_TEST_INPUT, DETERMINISTIC_TEST_INPUT], progress = True)
172183
chunker([DETERMINISTIC_TEST_INPUT, DETERMINISTIC_TEST_INPUT], offsets = True, progress = True)

Comments (0)