Commit 23a5c16

Merge branch 'main' into main

2 parents: 3c75dcc + a05b60c

File tree: 22 files changed (+990 / −57 lines)

22 files changed

+990
-57
lines changed

.github/workflows/CI.yml

Lines changed: 6 additions & 1 deletion

@@ -92,14 +92,19 @@ jobs:
       platform:
         - runner: windows-latest
           target: x64
+          architecture: x64
         - runner: windows-latest
           target: x86
+          architecture: x86
+        - runner: windows-11-arm
+          target: aarch64
+          architecture: arm64
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
         with:
           python-version: 3.x
-          architecture: ${{ matrix.platform.target }}
+          architecture: ${{ matrix.platform.architecture }}
       - name: Build wheels
         uses: PyO3/maturin-action@v1
         with:

.github/workflows/python.yml

Lines changed: 1 addition & 1 deletion

@@ -108,7 +108,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119 --ignore RUSTSEC-2024-0436

       - name: Install
         working-directory: ./bindings/python

.github/workflows/rust.yml

Lines changed: 7 additions & 1 deletion

@@ -94,11 +94,17 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119

       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
         if: matrix.os == 'ubuntu-latest'
         shell: bash
         working-directory: ./tokenizers
         run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md
+
+      - name: Check semver
+        if: matrix.os == 'ubuntu-latest'
+        uses: obi1kenobi/cargo-semver-checks-action@v2
+        with:
+          manifest-path: ./tokenizers/Cargo.toml

bindings/node/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 authors = ["Nicolas Patry <[email protected]>"]
 edition = "2021"
 name = "node"
-version = "0.21.4-dev.0"
+version = "0.22.2-dev.0"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

bindings/node/yarn.lock

Lines changed: 3 additions & 3 deletions

@@ -4405,14 +4405,14 @@ __metadata:
   linkType: hard

 "js-yaml@npm:^3.13.1, js-yaml@npm:^3.14.1":
-  version: 3.14.1
-  resolution: "js-yaml@npm:3.14.1"
+  version: 3.14.2
+  resolution: "js-yaml@npm:3.14.2"
   dependencies:
     argparse: ^1.0.7
     esprima: ^4.0.0
   bin:
     js-yaml: bin/js-yaml.js
-  checksum: bef146085f472d44dee30ec34e5cf36bf89164f5d585435a3d3da89e52622dff0b188a580e4ad091c3341889e14cb88cac6e4deb16dc5b1e9623bb0601fc255c
+  checksum: 626fc207734a3452d6ba84e1c8c226240e6d431426ed94d0ab043c50926d97c509629c08b1d636f5d27815833b7cfd225865631da9fb33cb957374490bf3e90b
   languageName: node
   linkType: hard

bindings/python/Cargo.toml

Lines changed: 4 additions & 1 deletion

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.21.4-dev.0"
+version = "0.22.2-dev.0"
 authors = ["Anthony MOI <[email protected]>"]
 edition = "2021"

@@ -15,6 +15,9 @@ serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
 pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] }
+tokio = { version = "1.47.1", features = ["rt", "rt-multi-thread", "macros", "signal"] }
+once_cell = "1.19.0"
 numpy = "0.25"
 ndarray = "0.16"
 itertools = "0.14"

bindings/python/py_src/tokenizers/__init__.pyi

Lines changed: 124 additions & 0 deletions

@@ -725,6 +725,130 @@ class Tokenizer:
         """
         pass

+    def async_decode_batch(self, sequences, skip_special_tokens=True):
+        """
+        Decode a batch of ids back to their corresponding string
+
+        Args:
+            sequences (:obj:`List` of :obj:`List[int]`):
+                The batch of sequences we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded strings
+
+        Returns:
+            :obj:`List[str]`: A list of decoded strings
+        """
+        pass
+
+    def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given input with character offsets.
+
+        This is an async version of encode that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode("A single sequence")
+
+        Args:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same that for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The encoded result
+
+        """
+        pass
+
+    def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs with character offsets.
+
+        This is an async version of encode_batch that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
+    def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs without tracking character offsets.
+
+        This is an async version of encode_batch_fast that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     def decode(self, ids, skip_special_tokens=True):
         """
         Decode the given list of ids back to a string

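A rough usage sketch of the new awaitable methods on `Tokenizer` (the "tokenizer.json" path below is a placeholder, and the example only assumes these coroutines behave like their synchronous counterparts, as the stubs above describe)::

    import asyncio

    from tokenizers import Tokenizer

    async def main():
        # Placeholder path; any serialized tokenizer file would do
        tokenizer = Tokenizer.from_file("tokenizer.json")
        # The async variants can be awaited concurrently on one event loop
        single, batch = await asyncio.gather(
            tokenizer.async_encode("A single sequence"),
            tokenizer.async_encode_batch(["first sequence", ("pair left", "pair right")]),
        )
        print(single.tokens)
        print(await tokenizer.async_decode_batch([e.ids for e in batch]))

    asyncio.run(main())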
bindings/python/py_src/tokenizers/implementations/base_tokenizer.py

Lines changed: 42 additions & 1 deletion

@@ -187,7 +187,7 @@ def normalize(self, sequence: str) -> str:
         Returns:
             The normalized string
         """
-        return self._tokenizer.normalize(sequence)
+        return self._tokenizer.normalizer.normalize_str(sequence)

     def encode(
         self,

@@ -259,6 +259,47 @@ def encode_batch(

         return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

+    async def async_encode_batch(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (tracks character offsets).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch: `inputs` can't be `None`")
+        # Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
+        return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+    async def async_encode_batch_fast(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (no character offsets, faster).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
+        return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)
+
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """Decode the given list of ids to a string sequence

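A minimal sketch of driving these wrapper methods from an event loop (the concrete implementation and the "vocab.txt" path are stand-ins for illustration, not taken from this commit)::

    import asyncio

    from tokenizers.implementations import BertWordPieceTokenizer

    async def encode_all(tokenizer, batches):
        # Launch every batch at once; the *_fast variant skips offset tracking
        tasks = [tokenizer.async_encode_batch_fast(batch) for batch in batches]
        return await asyncio.gather(*tasks)

    tokenizer = BertWordPieceTokenizer("vocab.txt")  # placeholder vocab file
    batches = [["hello world"], ["another", "batch of text"]]
    results = asyncio.run(encode_all(tokenizer, batches))
    print(sum(len(r) for r in results), "encodings")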
bindings/python/py_src/tokenizers/trainers/__init__.pyi

Lines changed: 17 additions & 0 deletions

@@ -45,6 +45,20 @@ class BpeTrainer(Trainer):
            highly repetitive tokens like `======` for wikipedia

    """
+    def __init__(
+        self,
+        vocab_size=30000,
+        min_frequency=0,
+        show_progress=True,
+        special_tokens=[],
+        limit_alphabet=None,
+        initial_alphabet=[],
+        continuing_subword_prefix=None,
+        end_of_word_suffix=None,
+        max_token_length=None,
+        words={},
+    ):
+        pass

 class UnigramTrainer(Trainer):
     """

@@ -85,6 +99,7 @@ class UnigramTrainer(Trainer):
         vocab_size=8000,
         show_progress=True,
         special_tokens=[],
+        initial_alphabet=[],
         shrinking_factor=0.75,
         unk_token=None,
         max_piece_length=16,

@@ -109,6 +124,8 @@ class WordLevelTrainer(Trainer):
         special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
     """
+    def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
+        pass

 class WordPieceTrainer(Trainer):
     """

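For context, the BpeTrainer signature documented above is typically used along these lines (the two-sentence corpus is made-up placeholder data)::

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["[UNK]", "[PAD]"])
    corpus = ["a tiny example corpus", "just enough text to train on"]  # placeholder data
    tokenizer.train_from_iterator(corpus, trainer=trainer)
    print(tokenizer.get_vocab_size())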
bindings/python/pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -23,15 +23,15 @@ classifiers = [
 ]
 keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
 dynamic = ["description", "license", "readme", "version"]
-dependencies = ["huggingface_hub>=0.16.4,<1.0"]
+dependencies = ["huggingface_hub>=0.16.4,<2.0"]

 [project.urls]
 Homepage = "https://github.com/huggingface/tokenizers"
 Source = "https://github.com/huggingface/tokenizers"


 [project.optional-dependencies]
-testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
+testing = ["pytest", "pytest-asyncio", "requests", "numpy", "datasets", "black==22.3", "ruff"]
 docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
 dev = ["tokenizers[testing]"]

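With pytest-asyncio added to the testing extras, async tests for the new methods could look roughly like this (the test itself is illustrative and not part of the commit)::

    import pytest

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    @pytest.mark.asyncio
    async def test_async_encode_roundtrip():
        # Tiny in-memory vocab so the test needs no fixture files
        tokenizer = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        encoding = await tokenizer.async_encode("hello world", add_special_tokens=False)
        assert encoding.ids == [0, 1]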