Make `transformers` an optional dependency and add the `py.typed` file.

* pyproject.toml: remove `transformers>=4.38.0` from the runtime dependencies;
  in the `test` extra, replace the Darwin-only `transformers<4.50.0` pin with
  `transformers!=4.50.0`; add `/python/xgrammar/py.typed` to `sdist.include`.
* python/xgrammar/py.typed: new empty file.
* python/xgrammar/tokenizer_info.py: wrap the `sentencepiece`, `tiktoken` and
  `transformers` imports in try/except ImportError. Outside type checking a
  missing module is bound to `None` (the two tokenizer classes to `Any`);
  under `TYPE_CHECKING` the imports are attempted and failures are ignored.

diff --git a/pyproject.toml b/pyproject.toml
index 3b800221..72ccd5e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ requires-python = ">=3.8, <4"
 dependencies = [
   "pydantic",
   "torch>=1.10.0",
-  "transformers>=4.38.0",
   "triton; platform_system == 'Linux' and platform_machine == 'x86_64'",
   "mlx-lm; platform_system == 'Darwin' and platform_machine == 'arm64'",
   "ninja",
@@ -34,11 +33,11 @@ test = [
   "huggingface-hub[cli]",
   "protobuf",
   "pytest",
-  "sentencepiece",
-  "tiktoken",
   # transformers==4.50.0 has error on MacOS.
   # https://github.com/huggingface/transformers/issues/36906
-  "transformers<4.50.0; platform_system == 'Darwin'",
+  "transformers!=4.50.0",
+  "sentencepiece",
+  "tiktoken",
 ]
 
 [build-system]
@@ -78,6 +77,7 @@ sdist.include = [
   "/cpp/**/*.h",
   "/include/**/*",
   "/python/xgrammar/**/*.py",
+  "/python/xgrammar/py.typed",
 
   # Third party files
   "/3rdparty/**/*",
diff --git a/python/xgrammar/py.typed b/python/xgrammar/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/python/xgrammar/tokenizer_info.py b/python/xgrammar/tokenizer_info.py
index 3eb0628f..92fcfbb2 100644
--- a/python/xgrammar/tokenizer_info.py
+++ b/python/xgrammar/tokenizer_info.py
@@ -2,18 +2,40 @@
 
 import json
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
-
-try:
-    import sentencepiece
-except ImportError:
-    sentencepiece = None
-try:
-    import tiktoken
-except ImportError:
-    tiktoken = None
-
-from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
+from types import ModuleType
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+if TYPE_CHECKING:
+    try:
+        import sentencepiece
+    except ImportError:
+        pass
+
+    try:
+        import tiktoken
+    except ImportError:
+        pass
+
+    try:
+        from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
+    except ImportError:
+        pass
+else:
+    try:
+        import sentencepiece
+    except ImportError:
+        sentencepiece = None
+
+    try:
+        import tiktoken
+    except ImportError:
+        tiktoken = None
+
+    try:
+        from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
+    except ImportError:
+        PreTrainedTokenizerBase = Any
+        PreTrainedTokenizerFast = Any
 
 from .base import XGRObject, _core
 from .support import logging
@@ -162,7 +184,7 @@ def from_huggingface(
 
         Parameters
         ----------
-        tokenizer : PreTrainedTokenizerBase
+        tokenizer : transformers.PreTrainedTokenizerBase
             The huggingface tokenizer.
 
         vocab_size : Optional[int], default: None
@@ -258,6 +280,7 @@ def from_huggingface(
                         "stop_token_ids is neither provided by user nor found from the tokenizer. "
                         "It will be automatically detected."
                     )
+
             return TokenizerInfo(
                 encoded_vocab,
                 VocabType.RAW,