
Commit 8f425c0

Fixed typo in model name.
1 parent f1b629e commit 8f425c0

5 files changed: 11 additions & 6 deletions


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2025-02-13
+### Fixed
+- Fixed `isaacus/emubert` mistakenly being set to `isaacus-dev/emubert` in the README and tests.
+
 ## [3.0.2] - 2025-02-13
 ### Fixed
 - Significantly sped up chunking very long texts with little to no variation in levels of whitespace used (fixing [#8](https://github.com/isaacus-dev/semchunk/issues/8)) and, in the process, also slightly improved overall performance.
@@ -117,6 +121,7 @@ All notable changes to `semchunk` will be documented here. This project adheres
 ### Added
 - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter.
 
+[3.0.3]: https://github.com/isaacus-dev/semchunk/compare/v3.0.2...v3.0.3
 [3.0.2]: https://github.com/isaacus-dev/semchunk/compare/v3.0.1...v3.0.2
 [3.0.1]: https://github.com/isaacus-dev/semchunk/compare/v3.0.0...v3.0.1
 [3.0.0]: https://github.com/isaacus-dev/semchunk/compare/v2.2.2...v3.0.0

README.md

Lines changed: 2 additions & 2 deletions
@@ -41,10 +41,10 @@ text = 'The quick brown fox jumps over the lazy dog.'
 # OpenAI `tiktoken` encoding or Hugging Face model, or a custom tokenizer that has an `encode()`
 # method (like a `tiktoken`, `transformers` or `tokenizers` tokenizer) or a custom token counting
 # function that takes a text and returns the number of tokens in it.
-chunker = semchunk.chunkerify('isaacus-dev/emubert', chunk_size) or \
+chunker = semchunk.chunkerify('isaacus/emubert', chunk_size) or \
           semchunk.chunkerify('gpt-4', chunk_size) or \
           semchunk.chunkerify('cl100k_base', chunk_size) or \
-          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus-dev/emubert'), chunk_size) or \
+          semchunk.chunkerify(AutoTokenizer.from_pretrained('isaacus/emubert'), chunk_size) or \
          semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4'), chunk_size) or \
          semchunk.chunkerify(lambda text: len(text.split()), chunk_size)
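For reference, the corrected usage as a minimal, self-contained sketch. The `chunk_size` value and the printed output are illustrative choices, not part of the diff; only the `chunkerify()` call and the corrected model name come from the README snippet above.

import semchunk

# Illustrative chunk size; the README defines `chunk_size` separately.
chunk_size = 512

text = 'The quick brown fox jumps over the lazy dog.'

# Build a chunker from the corrected Hugging Face model name.
chunker = semchunk.chunkerify('isaacus/emubert', chunk_size)

# Chunk a single text into a list of chunk strings.
chunks = chunker(text)
print(chunks)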

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "3.0.2"
+version = "3.0.3"
 authors = [
     {name="Isaacus", email="[email protected]"},
     {name="Umar Butler", email="[email protected]"},

tests/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def initialize_test_token_counters() -> dict[str, Callable[[str], int]]:
     """Initialize `tiktoken`, `transformers`, character and word token counters for testing purposes."""
 
     gpt4_tiktoken_tokenizer = tiktoken.encoding_for_model('gpt-4').encode
-    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus-dev/emubert'))
+    emubert_transformers_tokenizer = make_transformers_tokenizer(transformers.AutoTokenizer.from_pretrained('isaacus/emubert'))
 
     def word_tokenizer(text: str) -> list[str]:
         """Tokenize a text into words."""

tests/test_semchunk.py

Lines changed: 2 additions & 2 deletions
@@ -159,7 +159,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using `tiktoken` tokenizers, encodings and a `transformers` tokenizer by name with `chunkerify()`.
-    for name in ['cl100k_base', 'gpt-4', 'isaacus-dev/emubert']:
+    for name in ['cl100k_base', 'gpt-4', 'isaacus/emubert']:
         chunker = semchunk.chunkerify(name, 1)
         chunker(DETERMINISTIC_TEST_INPUT)
         if TEST_OFFSETS: chunker(DETERMINISTIC_TEST_INPUT, offsets = True)
@@ -175,7 +175,7 @@ def test_semchunk() -> None:
     assert error_raised
 
     # Test using a `transformers` tokenizer directly.
-    tokenizer = AutoTokenizer.from_pretrained('isaacus-dev/emubert')
+    tokenizer = AutoTokenizer.from_pretrained('isaacus/emubert')
     chunker = semchunk.chunkerify(tokenizer, 1)
 
     # Test using a `tiktoken` tokenizer directly.
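On the `TEST_OFFSETS` branch exercised above: calling a chunker with `offsets = True` also reports where each chunk sits in the original text. A minimal usage sketch follows; the tuple return shape and the `(start, end)` offset format shown here are assumptions based on the library's offset support, not confirmed by this diff.

import semchunk

# A whitespace word counter keeps the sketch dependency-free; 4 is an arbitrary chunk size.
chunker = semchunk.chunkerify(lambda text: len(text.split()), 4)

text = 'The quick brown fox jumps over the lazy dog.'

# Assumed shape: a list of chunks plus a parallel sequence of (start, end) offsets.
chunks, offsets = chunker(text, offsets = True)

for chunk, (start, end) in zip(chunks, offsets):
    # Each offset maps its chunk back to a span of the original text.
    print(f'{start}-{end}: {chunk!r}')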
