From a5a474b8059cf2f0aca3dff17936b6d6552539fb Mon Sep 17 00:00:00 2001 From: Ubospica Date: Thu, 7 Aug 2025 02:31:21 -0400 Subject: [PATCH 1/2] finish Signed-off-by: Ubospica --- pyproject.toml | 5 +++-- python/xgrammar/py.typed | 0 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 python/xgrammar/py.typed diff --git a/pyproject.toml b/pyproject.toml index 3b800221..0b958276 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ requires-python = ">=3.8, <4" dependencies = [ "pydantic", "torch>=1.10.0", - "transformers>=4.38.0", "triton; platform_system == 'Linux' and platform_machine == 'x86_64'", "mlx-lm; platform_system == 'Darwin' and platform_machine == 'arm64'", "ninja", @@ -38,7 +37,7 @@ test = [ "tiktoken", # transformers==4.50.0 has error on MacOS. # https://github.com/huggingface/transformers/issues/36906 - "transformers<4.50.0; platform_system == 'Darwin'", + "transformers!=4.50.0", ] [build-system] @@ -78,6 +77,7 @@ sdist.include = [ "/cpp/**/*.h", "/include/**/*", "/python/xgrammar/**/*.py", + "/python/xgrammar/py.typed", # Third party files "/3rdparty/**/*", @@ -88,6 +88,7 @@ sdist.include = [ "/README.md", "/NOTICE", + # Tests "/tests/**/*", ] diff --git a/python/xgrammar/py.typed b/python/xgrammar/py.typed new file mode 100644 index 00000000..e69de29b From d4ca7108ffc660ae679ef207563df96e2c5e7893 Mon Sep 17 00:00:00 2001 From: Ubospica Date: Thu, 7 Aug 2025 05:52:56 -0400 Subject: [PATCH 2/2] try to fix Signed-off-by: Ubospica --- pyproject.toml | 5 ++-- python/xgrammar/tokenizer_info.py | 49 +++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b958276..72ccd5e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,11 +33,11 @@ test = [ "huggingface-hub[cli]", "protobuf", "pytest", - "sentencepiece", - "tiktoken", # transformers==4.50.0 has error on MacOS. # https://github.com/huggingface/transformers/issues/36906 "transformers!=4.50.0", + "sentencepiece", + "tiktoken", ] [build-system] @@ -88,7 +88,6 @@ sdist.include = [ "/README.md", "/NOTICE", - # Tests "/tests/**/*", ] diff --git a/python/xgrammar/tokenizer_info.py b/python/xgrammar/tokenizer_info.py index 3eb0628f..92fcfbb2 100644 --- a/python/xgrammar/tokenizer_info.py +++ b/python/xgrammar/tokenizer_info.py @@ -2,18 +2,40 @@ import json from enum import Enum -from typing import Any, Dict, List, Optional, Union - -try: - import sentencepiece -except ImportError: - sentencepiece = None -try: - import tiktoken -except ImportError: - tiktoken = None - -from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast +from types import ModuleType +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +if TYPE_CHECKING: + try: + import sentencepiece + except ImportError: + pass + + try: + import tiktoken + except ImportError: + pass + + try: + from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast + except ImportError: + pass +else: + try: + import sentencepiece + except ImportError: + sentencepiece = None + + try: + import tiktoken + except ImportError: + tiktoken = None + + try: + from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast + except ImportError: + PreTrainedTokenizerBase = Any + PreTrainedTokenizerFast = Any from .base import XGRObject, _core from .support import logging @@ -162,7 +184,7 @@ def from_huggingface( Parameters ---------- - tokenizer : PreTrainedTokenizerBase + tokenizer : transformers.PreTrainedTokenizerBase The huggingface tokenizer. vocab_size : Optional[int], default: None @@ -258,6 +280,7 @@ def from_huggingface( "stop_token_ids is neither provided by user nor found from the tokenizer. " "It will be automatically detected." ) + return TokenizerInfo( encoded_vocab, VocabType.RAW,