Commit d4147d5

Changed test model from isaacus/emubert to isaacus/kanon-tokenizer.

1 parent b164036

5 files changed: +10 -6 lines changed

CHANGELOG.md
Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.1.2] - 2025-03-06
+### Changed
+- Changed test model from `isaacus/emubert` to `isaacus/kanon-tokenizer`.
+
 ## [3.1.1] - 2025-02-18
 ### Added
 - Added a note to the quickstart section of the README advising users to deduct the number of special tokens automatically added by their tokenizer from their chunk size. This note had been removed in version 3.0.0 but has been readded as it is unlikely to be obvious to users.
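The 3.1.1 note above about deducting special tokens is easiest to see in code. A minimal sketch, assuming a `transformers` tokenizer; the 512-token limit is illustrative only, not something prescribed by this commit:

```python
import semchunk
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('isaacus/kanon-tokenizer')

# The tokenizer silently adds special tokens (e.g. [CLS]/[SEP]) when encoding, so
# subtract their count from the target size to keep encoded chunks within the limit.
chunk_size = 512 - tokenizer.num_special_tokens_to_add()  # 512 is an illustrative limit

chunker = semchunk.chunkerify(tokenizer, chunk_size)
```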

README.md
Lines changed: 2 additions & 2 deletions

@@ -43,10 +43,10 @@ text = 'The quick brown fox jumps over the lazy dog.'
 # OpenAI `tiktoken` encoding or Hugging Face model, or a custom tokenizer that has an `encode()`
 # method (like a `tiktoken`, `transformers` or `tokenizers` tokenizer) or a custom token counting
 # function that takes a text and returns the number of tokens in it.
-chunker = semchunk.chunkerify('isaacus/emubert', chunk_size) or \
+chunker = semchunk.chunkerify('isaacus/kanon-tokenizer', chunk_size) or \
           semchunk.chunkerify('gpt-4', chunk_size) or \
           semchunk.chunkerify('cl100k_base', chunk_size) or \
-          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus/emubert'), chunk_size) or \
+          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus/kanon-tokenizer'), chunk_size) or \
           semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), chunk_size) or \
           semchunk.chunkerify(lambda text: len(text.split()), chunk_size)
 

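For context, a minimal usage sketch of the kind of chunker the README snippet above builds; the sample sentence mirrors the README's own example, and the chunk size of 4 tokens is illustrative only:

```python
import semchunk

# Build a chunker from a Hugging Face tokenizer name and split the example sentence;
# the result is a list of chunk strings, each at most `chunk_size` tokens long.
chunker = semchunk.chunkerify('isaacus/kanon-tokenizer', chunk_size = 4)
print(chunker('The quick brown fox jumps over the lazy dog.'))
```
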
pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "3.1.1"
+version = "3.1.2"
 authors = [
     {name="Isaacus", email="[email protected]"},
     {name="Umar Butler", email="[email protected]"},

tests/helpers.py
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ def initialize_test_token_counters() -> dict[str, Callable[[str], int]]:
     """Initialize `tiktoken`, `transformers`, character and word token counters for testing purposes."""
 
     gpt4_tiktoken_tokenizer = tiktoken.encoding_for_model('gpt-4').encode
-    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus/emubert'))
+    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus/kanon-tokenizer'))
 
     def word_tokenizer(text: str) -> list[str]:
         """Tokenize a text into words."""

tests/test_semchunk.py
Lines changed: 2 additions & 2 deletions

@@ -159,7 +159,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using `tiktoken` tokenizers, encodings and a `transformers` tokenizer by name with `chunkerify()`.
-    for name in ['cl100k_base', 'gpt-4', 'isaacus/emubert']:
+    for name in ['cl100k_base', 'gpt-4', 'isaacus/kanon-tokenizer']:
         chunker = semchunk.chunkerify(name, 1)
         chunker(DETERMINISTIC_TEST_INPUT)
         if TEST_OFFSETS: chunker(DETERMINISTIC_TEST_INPUT, offsets = True)
@@ -175,7 +175,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using a `transformers` tokenizer directly.
-    tokenizer = AutoTokenizer.from_pretrained('isaacus/emubert')
+    tokenizer = AutoTokenizer.from_pretrained('isaacus/kanon-tokenizer')
     chunker = semchunk.chunkerify(tokenizer, 1)
 
     # Test using a `tiktoken` tokenizer directly.
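As a side note, the "directly" cases these tests cover amount to passing tokenizer objects, rather than names, straight to `chunkerify()`. A minimal sketch mirroring the tests and the README snippet (the chunk size of 1 simply copies the tests):

```python
import semchunk
import tiktoken
from transformers import AutoTokenizer

# A `transformers` tokenizer object passed directly, as in the test above.
hf_chunker = semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus/kanon-tokenizer'), 1)

# A `tiktoken` encoding object passed directly, as in the README snippet.
tiktoken_chunker = semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), 1)
```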
