
Commit 8f425c0

Fixed typo in model name.
1 parent f1b629e commit 8f425c0

5 files changed: 11 additions & 6 deletions


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2025-02-13
+### Fixed
+- Fixed `isaacus/emubert` mistakenly being set to `isaacus-dev/emubert` in the README and tests.
+
 ## [3.0.2] - 2025-02-13
 ### Fixed
 - Significantly sped up chunking very long texts with little to no variation in levels of whitespace used (fixing [#8](https://github.com/isaacus-dev/semchunk/issues/8)) and, in the process, also slightly improved overall performance.
@@ -117,6 +121,7 @@ All notable changes to `semchunk` will be documented here. This project adheres
 ### Added
 - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter.
 
+[3.0.3]: https://github.com/isaacus-dev/semchunk/compare/v3.0.2...v3.0.3
 [3.0.2]: https://github.com/isaacus-dev/semchunk/compare/v3.0.1...v3.0.2
 [3.0.1]: https://github.com/isaacus-dev/semchunk/compare/v3.0.0...v3.0.1
 [3.0.0]: https://github.com/isaacus-dev/semchunk/compare/v2.2.2...v3.0.0

README.md

Lines changed: 2 additions & 2 deletions
@@ -41,10 +41,10 @@ text = 'The quick brown fox jumps over the lazy dog.'
 # OpenAI `tiktoken` encoding or Hugging Face model, or a custom tokenizer that has an `encode()`
 # method (like a `tiktoken`, `transformers` or `tokenizers` tokenizer) or a custom token counting
 # function that takes a text and returns the number of tokens in it.
-chunker = semchunk.chunkerify('isaacus-dev/emubert', chunk_size) or \
+chunker = semchunk.chunkerify('isaacus/emubert', chunk_size) or \
           semchunk.chunkerify('gpt-4', chunk_size) or \
           semchunk.chunkerify('cl100k_base', chunk_size) or \
-          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus-dev/emubert'), chunk_size) or \
+          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus/emubert'), chunk_size) or \
          semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), chunk_size) or \
          semchunk.chunkerify(lambda text: len(text.split()), chunk_size)
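For reference, the corrected usage as a minimal, self-contained sketch. The `chunk_size` value and the printed output are illustrative choices, not part of the diff; only the `chunkerify()` call and the corrected model name come from the README snippet above.

import semchunk

# Illustrative chunk size; the README defines `chunk_size` separately.
chunk_size = 512

text = 'The quick brown fox jumps over the lazy dog.'

# Build a chunker from the corrected Hugging Face model name.
chunker = semchunk.chunkerify('isaacus/emubert', chunk_size)

# Chunk a single text into a list of chunk strings.
chunks = chunker(text)
print(chunks)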

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "3.0.2"
+version = "3.0.3"
 authors = [
     {name="Isaacus", email="[email protected]"},
     {name="Umar Butler", email="[email protected]"},

tests/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def initialize_test_token_counters() -> dict[str, Callable[[str], int]]:
     """Initialize `tiktoken`, `transformers`, character and word token counters for testing purposes."""
 
     gpt4_tiktoken_tokenizer = tiktoken.encoding_for_model('gpt-4').encode
-    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus-dev/emubert'))
+    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus/emubert'))
 
     def word_tokenizer(text: str) -> list[str]:
         """Tokenize a text into words."""

tests/test_semchunk.py

Lines changed: 2 additions & 2 deletions
@@ -159,7 +159,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using `tiktoken` tokenizers, encodings and a `transformers` tokenizer by name with `chunkerify()`.
-    for name in ['cl100k_base', 'gpt-4', 'isaacus-dev/emubert']:
+    for name in ['cl100k_base', 'gpt-4', 'isaacus/emubert']:
         chunker = semchunk.chunkerify(name, 1)
         chunker(DETERMINISTIC_TEST_INPUT)
         if TEST_OFFSETS: chunker(DETERMINISTIC_TEST_INPUT, offsets = True)
@@ -175,7 +175,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using a `transformers` tokenizer directly.
-    tokenizer = AutoTokenizer.from_pretrained('isaacus-dev/emubert')
+    tokenizer = AutoTokenizer.from_pretrained('isaacus/emubert')
     chunker = semchunk.chunkerify(tokenizer, 1)
 
     # Test using a `tiktoken` tokenizer directly.
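On the `TEST_OFFSETS` branch exercised above: calling a chunker with `offsets = True` also reports where each chunk sits in the original text. A minimal usage sketch follows; the tuple return shape and the `(start, end)` offset format shown here are assumptions based on the library's offset support, not confirmed by this diff.

import semchunk

# A whitespace word counter keeps the sketch dependency-free; 4 is an arbitrary chunk size.
chunker = semchunk.chunkerify(lambda text: len(text.split()), 4)

text = 'The quick brown fox jumps over the lazy dog.'

# Assumed shape: a list of chunks plus a parallel sequence of (start, end) offsets.
chunks, offsets = chunker(text, offsets = True)

for chunk, (start, end) in zip(chunks, offsets):
    # Each offset maps its chunk back to a span of the original text.
    print(f'{start}-{end}: {chunk!r}')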
