Commit 23a5c16

Merge branch 'main' into main

2 parents: 3c75dcc + a05b60c

File tree: 22 files changed (+990 / −57 lines)

22 files changed

+990
-57
lines changed

.github/workflows/CI.yml

Lines changed: 6 additions & 1 deletion

@@ -92,14 +92,19 @@ jobs:
       platform:
         - runner: windows-latest
           target: x64
+          architecture: x64
         - runner: windows-latest
           target: x86
+          architecture: x86
+        - runner: windows-11-arm
+          target: aarch64
+          architecture: arm64
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
         with:
           python-version: 3.x
-          architecture: ${{ matrix.platform.target }}
+          architecture: ${{ matrix.platform.architecture }}
       - name: Build wheels
         uses: PyO3/maturin-action@v1
         with:

.github/workflows/python.yml

Lines changed: 1 addition & 1 deletion

@@ -108,7 +108,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119 --ignore RUSTSEC-2024-0436

       - name: Install
         working-directory: ./bindings/python

.github/workflows/rust.yml

Lines changed: 7 additions & 1 deletion

@@ -94,11 +94,17 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2024-0436 --ignore RUSTSEC-2025-0014 --ignore RUSTSEC-2025-0119

       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
         if: matrix.os == 'ubuntu-latest'
         shell: bash
         working-directory: ./tokenizers
         run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md
+
+      - name: Check semver
+        if: matrix.os == 'ubuntu-latest'
+        uses: obi1kenobi/cargo-semver-checks-action@v2
+        with:
+          manifest-path: ./tokenizers/Cargo.toml

bindings/node/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 authors = ["Nicolas Patry <[email protected]>"]
 edition = "2021"
 name = "node"
-version = "0.21.4-dev.0"
+version = "0.22.2-dev.0"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

bindings/node/yarn.lock

Lines changed: 3 additions & 3 deletions

@@ -4405,14 +4405,14 @@ __metadata:
   linkType: hard

 "js-yaml@npm:^3.13.1, js-yaml@npm:^3.14.1":
-  version: 3.14.1
-  resolution: "js-yaml@npm:3.14.1"
+  version: 3.14.2
+  resolution: "js-yaml@npm:3.14.2"
   dependencies:
     argparse: ^1.0.7
     esprima: ^4.0.0
   bin:
     js-yaml: bin/js-yaml.js
-  checksum: bef146085f472d44dee30ec34e5cf36bf89164f5d585435a3d3da89e52622dff0b188a580e4ad091c3341889e14cb88cac6e4deb16dc5b1e9623bb0601fc255c
+  checksum: 626fc207734a3452d6ba84e1c8c226240e6d431426ed94d0ab043c50926d97c509629c08b1d636f5d27815833b7cfd225865631da9fb33cb957374490bf3e90b
   languageName: node
   linkType: hard

bindings/python/Cargo.toml

Lines changed: 4 additions & 1 deletion

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.21.4-dev.0"
+version = "0.22.2-dev.0"
 authors = ["Anthony MOI <[email protected]>"]
 edition = "2021"

@@ -15,6 +15,9 @@ serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
 pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
+pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] }
+tokio = { version = "1.47.1", features = ["rt", "rt-multi-thread", "macros", "signal"] }
+once_cell = "1.19.0"
 numpy = "0.25"
 ndarray = "0.16"
 itertools = "0.14"

bindings/python/py_src/tokenizers/__init__.pyi

Lines changed: 124 additions & 0 deletions

@@ -725,6 +725,130 @@ class Tokenizer:
         """
         pass

+    def async_decode_batch(self, sequences, skip_special_tokens=True):
+        """
+        Decode a batch of ids back to their corresponding string
+
+        Args:
+            sequences (:obj:`List` of :obj:`List[int]`):
+                The batch of sequences we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded strings
+
+        Returns:
+            :obj:`List[str]`: A list of decoded strings
+        """
+        pass
+
+    def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given input with character offsets.
+
+        This is an async version of encode that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode("A single sequence")
+
+        Args:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same that for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The encoded result
+
+        """
+        pass
+
+    def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs with character offsets.
+
+        This is an async version of encode_batch that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
+    def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Asynchronously encode the given batch of inputs without tracking character offsets.
+
+        This is an async version of encode_batch_fast that can be awaited in async Python code.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                await async_encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     def decode(self, ids, skip_special_tokens=True):
         """
         Decode the given list of ids back to a string

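A rough usage sketch of the new awaitable methods on `Tokenizer` (the "tokenizer.json" path below is a placeholder, and the example only assumes these coroutines behave like their synchronous counterparts, as the stubs above describe)::

    import asyncio

    from tokenizers import Tokenizer

    async def main():
        # Placeholder path; any serialized tokenizer file would do
        tokenizer = Tokenizer.from_file("tokenizer.json")
        # The async variants can be awaited concurrently on one event loop
        single, batch = await asyncio.gather(
            tokenizer.async_encode("A single sequence"),
            tokenizer.async_encode_batch(["first sequence", ("pair left", "pair right")]),
        )
        print(single.tokens)
        print(await tokenizer.async_decode_batch([e.ids for e in batch]))

    asyncio.run(main())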
bindings/python/py_src/tokenizers/implementations/base_tokenizer.py

Lines changed: 42 additions & 1 deletion

@@ -187,7 +187,7 @@ def normalize(self, sequence: str) -> str:
         Returns:
             The normalized string
         """
-        return self._tokenizer.normalize(sequence)
+        return self._tokenizer.normalizer.normalize_str(sequence)

     def encode(
         self,

@@ -259,6 +259,47 @@ def encode_batch(

         return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

+    async def async_encode_batch(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (tracks character offsets).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch: `inputs` can't be `None`")
+        # Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
+        return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+    async def async_encode_batch_fast(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Asynchronously encode a batch (no character offsets, faster).
+
+        Args:
+            inputs: A list of single or pair sequences to encode.
+            is_pretokenized: Whether inputs are already pre-tokenized.
+            add_special_tokens: Whether to add special tokens.
+
+        Returns:
+            A list of Encoding.
+        """
+        if inputs is None:
+            raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
+        return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)
+
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """Decode the given list of ids to a string sequence

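A minimal sketch of driving these wrapper methods from an event loop (the concrete implementation and the "vocab.txt" path are stand-ins for illustration, not taken from this commit)::

    import asyncio

    from tokenizers.implementations import BertWordPieceTokenizer

    async def encode_all(tokenizer, batches):
        # Launch every batch at once; the *_fast variant skips offset tracking
        tasks = [tokenizer.async_encode_batch_fast(batch) for batch in batches]
        return await asyncio.gather(*tasks)

    tokenizer = BertWordPieceTokenizer("vocab.txt")  # placeholder vocab file
    batches = [["hello world"], ["another", "batch of text"]]
    results = asyncio.run(encode_all(tokenizer, batches))
    print(sum(len(r) for r in results), "encodings")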
bindings/python/py_src/tokenizers/trainers/__init__.pyi

Lines changed: 17 additions & 0 deletions

@@ -45,6 +45,20 @@ class BpeTrainer(Trainer):
            highly repetitive tokens like `======` for wikipedia

    """
+    def __init__(
+        self,
+        vocab_size=30000,
+        min_frequency=0,
+        show_progress=True,
+        special_tokens=[],
+        limit_alphabet=None,
+        initial_alphabet=[],
+        continuing_subword_prefix=None,
+        end_of_word_suffix=None,
+        max_token_length=None,
+        words={},
+    ):
+        pass

 class UnigramTrainer(Trainer):
     """

@@ -85,6 +99,7 @@ class UnigramTrainer(Trainer):
         vocab_size=8000,
         show_progress=True,
         special_tokens=[],
+        initial_alphabet=[],
         shrinking_factor=0.75,
         unk_token=None,
         max_piece_length=16,

@@ -109,6 +124,8 @@ class WordLevelTrainer(Trainer):
         special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
     """
+    def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
+        pass

 class WordPieceTrainer(Trainer):
     """

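For context, the BpeTrainer signature documented above is typically used along these lines (the two-sentence corpus is made-up placeholder data)::

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["[UNK]", "[PAD]"])
    corpus = ["a tiny example corpus", "just enough text to train on"]  # placeholder data
    tokenizer.train_from_iterator(corpus, trainer=trainer)
    print(tokenizer.get_vocab_size())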
bindings/python/pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -23,15 +23,15 @@ classifiers = [
 ]
 keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
 dynamic = ["description", "license", "readme", "version"]
-dependencies = ["huggingface_hub>=0.16.4,<1.0"]
+dependencies = ["huggingface_hub>=0.16.4,<2.0"]

 [project.urls]
 Homepage = "https://github.com/huggingface/tokenizers"
 Source = "https://github.com/huggingface/tokenizers"


 [project.optional-dependencies]
-testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
+testing = ["pytest", "pytest-asyncio", "requests", "numpy", "datasets", "black==22.3", "ruff"]
 docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
 dev = ["tokenizers[testing]"]

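With pytest-asyncio added to the testing extras, async tests for the new methods could look roughly like this (the test itself is illustrative and not part of the commit)::

    import pytest

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace

    @pytest.mark.asyncio
    async def test_async_encode_roundtrip():
        # Tiny in-memory vocab so the test needs no fixture files
        tokenizer = Tokenizer(WordLevel({"hello": 0, "world": 1, "[UNK]": 2}, unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        encoding = await tokenizer.async_encode("hello world", add_special_tokens=False)
        assert encoding.ids == [0, 1]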