Skip to content

Commit d2bf970

Browse files
authored
Tokenizer update (#2457)
1 parent a012112 commit d2bf970

File tree

7 files changed: 62 additions (+62) and 18 deletions (-18).

paddleformers/transformers/auto/tokenizer.py

Lines changed: 25 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,12 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
import json
16-
import logging
1716
import os
1817
import warnings
1918
from typing import Dict, Optional, Union
2019

2120
import transformers as hf
22-
from transformers import PretrainedConfig
21+
from transformers import AutoConfig, PretrainedConfig
2322
from transformers.dynamic_module_utils import (
2423
get_class_from_dynamic_module,
2524
resolve_trust_remote_code,
@@ -40,10 +39,8 @@
4039
from transformers.utils import cached_file
4140

4241
from ...utils.download import DownloadSource, resolve_file_path
42+
from ...utils.log import logger
4343
from ..tokenizer_utils import PaddleTokenizerMixin
44-
from .configuration import AutoConfig
45-
46-
logger = logging.getLogger(__name__)
4744

4845

4946
def get_paddleformers_tokenizer_config(
@@ -158,7 +155,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
158155
download_hub = kwargs.get("download_hub", None)
159156
if download_hub is None:
160157
download_hub = os.environ.get("DOWNLOAD_SOURCE", "huggingface")
161-
logger.info(f"Using download source: {download_hub}")
162158
use_auth_token = kwargs.pop("use_auth_token", None)
163159
if use_auth_token is not None:
164160
warnings.warn(
@@ -216,7 +212,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
216212
kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
217213
config_tokenizer_class = tokenizer_config.get("tokenizer_class")
218214
else:
219-
tokenizer_config = get_paddleformers_tokenizer_config(pretrained_model_name_or_path, **kwargs)
215+
try:
216+
tokenizer_config = get_paddleformers_tokenizer_config(pretrained_model_name_or_path, **kwargs)
217+
except Exception as e:
218+
if any(
219+
keyword in str(e).lower()
220+
for keyword in ["not exist", "not found", "entrynotfound", "notexist", "does not appear"]
221+
):
222+
hf_link = f"https://huggingface.co/{pretrained_model_name_or_path}"
223+
modelscope_link = f"https://modelscope.cn/models/{pretrained_model_name_or_path}"
224+
encoded_model_name = pretrained_model_name_or_path.replace("/", "%2F")
225+
aistudio_link = f"https://aistudio.baidu.com/modelsoverview?sortBy=weight&q={encoded_model_name}"
226+
227+
raise ValueError(
228+
f"Unable to find {TOKENIZER_CONFIG_FILE} in the model repository '{pretrained_model_name_or_path}'. Please check:\n"
229+
f"The model repository ID is correct for your chosen source:\n"
230+
f" - Hugging Face Hub: {hf_link}\n"
231+
f" - ModelScope: {modelscope_link}\n"
232+
f" - AI Studio: {aistudio_link}\n"
233+
f"Note: The repository ID may differ between ModelScope, AI Studio, and Hugging Face Hub.\n"
234+
f"You are currently using the download source: {download_hub}. Please check the repository ID on the official website."
235+
) from None
236+
else:
237+
raise
220238
config_tokenizer_class = tokenizer_config.get("tokenizer_class")
221239

222240
tokenizer_auto_map = None

paddleformers/transformers/tokenizer_utils.py

Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -209,14 +209,38 @@ def from_pretrained(
209209
download_hub=download_hub,
210210
local_files_only=local_files_only,
211211
)
212-
except Exception:
212+
except (FileNotFoundError, EnvironmentError):
213213
pass
214+
except Exception as e:
215+
raise e
214216
# 获得cache_dir的目录
215217
for file_id, file_path in resolved_vocab_files.items():
216218
if resolved_vocab_files[file_id] is not None:
217219
cache_dir = os.path.dirname(resolved_vocab_files[file_id])
218220
break
219221

222+
if not any(key in resolved_vocab_files for key in cls.vocab_files_names.keys()):
223+
hf_link = f"https://huggingface.co/{pretrained_model_name_or_path}"
224+
modelscope_link = f"https://modelscope.cn/models/{pretrained_model_name_or_path}"
225+
encoded_model_name = pretrained_model_name_or_path.replace("/", "%2F")
226+
aistudio_link = f"https://aistudio.baidu.com/modelsoverview?sortBy=weight&q={encoded_model_name}"
227+
228+
raise ValueError(
229+
f"No vocabulary files found for model '{pretrained_model_name_or_path}'. "
230+
f"Please check:\n"
231+
f"1. The model repository ID is correct for your chosen source:\n"
232+
f" - Hugging Face Hub: {hf_link}\n"
233+
f" - ModelScope: {modelscope_link}\n"
234+
f" - AI Studio: {aistudio_link}\n"
235+
f"2. You have permission to access this model repository\n"
236+
f"3. Network connection is working properly\n"
237+
f"4. Try clearing cache and downloading again\n"
238+
f"Expected vocabulary files: {list(cls.vocab_files_names.keys())}\n"
239+
f"Valid files found: {list(resolved_vocab_files.keys())}\n"
240+
f"Note: The repository ID may differ between ModelScope, AI Studio, and Hugging Face Hub.\n"
241+
f"You are currently using the download source: {download_hub}. Please check the repository ID on the official website."
242+
)
243+
220244
return super()._from_pretrained(
221245
resolved_vocab_files,
222246
pretrained_model_name_or_path,

paddleformers/utils/download/download.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ def resolve_file_path(
180180
cache_file_name = hf_try_to_load_from_cache(repo_id, filename, cache_dir, subfolder, revision, repo_type)
181181
if download_hub == DownloadSource.HUGGINGFACE and cache_file_name is _CACHED_NO_EXIST:
182182
cache_file_name = None
183-
if cache_file_name is not None:
183+
if cache_file_name is not None and os.path.exists(str(cache_file_name)):
184184
return cache_file_name
185185

186186
# download file from different origins

requirements-dev.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,5 @@ huggingface_hub>=0.19.2
1717
tiktoken
1818
tokenizers<=0.20.3; python_version<="3.8"
1919
tokenizers>=0.21,<0.22; python_version>"3.8"
20-
modelscope
20+
modelscope
21+
transformers

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,4 +19,5 @@ ml_dtypes
1919
tokenizers<=0.20.3; python_version<="3.8"
2020
tokenizers>=0.21,<0.22; python_version>"3.8"
2121
omegaconf
22-
modelscope
22+
modelscope
23+
transformers

tests/transformers/qwen2/test_tokenizer.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,11 +40,11 @@ def tearDown(self):
4040
shutil.rmtree(test_dir)
4141

4242
def test_slow_tokenizer_from_pretrained(self):
43-
tokenizer = Qwen2Tokenizer.from_pretrained(self.from_pretrained_id, from_aistudio=True)
43+
tokenizer = Qwen2Tokenizer.from_pretrained(self.from_pretrained_id, download_hub="aistudio")
4444
self.assertTrue(tokenizer is not None)
4545

4646
def test_slow_tokenizer_save_pretrained(self):
47-
tokenizer = Qwen2Tokenizer.from_pretrained(self.from_pretrained_id, from_aistudio=True)
47+
tokenizer = Qwen2Tokenizer.from_pretrained(self.from_pretrained_id)
4848
special_tokens_dict = {"additional_special_tokens": ["[ENT_START]", "[ENT_END]"]}
4949
tokenizer.add_special_tokens(special_tokens_dict)
5050
tokenizer.add_tokens(["new_word", "another_word"])
@@ -53,11 +53,11 @@ def test_slow_tokenizer_save_pretrained(self):
5353
self.assertTrue(os.path.exists("./slow_tokenizer/tokenizer_config.json"))
5454

5555
def test_fast_tokenizer_from_pretrained(self):
56-
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, from_aistudio=True)
56+
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, download_hub="aistudio")
5757
self.assertTrue(tokenizer is not None)
5858

5959
def test_fast_tokenizer_save_pretrained(self):
60-
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, from_aistudio=True)
60+
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, download_hub="aistudio")
6161
special_tokens_dict = {"additional_special_tokens": ["[ENT_START]", "[ENT_END]"]}
6262
tokenizer.add_special_tokens(special_tokens_dict)
6363
tokenizer.add_tokens(["new_word", "another_word"])
@@ -66,7 +66,7 @@ def test_fast_tokenizer_save_pretrained(self):
6666
self.assertTrue(os.path.exists("./fast_tokenizer/tokenizer_config.json"))
6767

6868
def test_tokenize(self):
69-
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, from_aistudio=True)
69+
tokenizer = Qwen2TokenizerFast.from_pretrained(self.from_pretrained_id, download_hub="aistudio")
7070
text = "hello world, this is a tokenizer test"
7171
output_dict = tokenizer(text)
7272
decode_text = tokenizer.decode(output_dict["input_ids"], skip_special_tokens=True)

tests/transformers/test_hf_tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
from paddleformers.transformers import AutoTokenizer, Qwen2Tokenizer
2020

2121

22-
@unittest.skip("multi source download CI not support")
22+
@unittest.skip("don't support multisource download")
2323
class TestHFMultiSourceTokenizer(unittest.TestCase):
2424
def encode(self, tokenizer):
2525
input_text = "hello world, 你好"
@@ -68,7 +68,7 @@ def test_auto_tokenizer(self):
6868

6969
class TestHFTokenizer(unittest.TestCase):
7070
def setUp(self):
71-
self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B", from_hf_hub=True)
71+
self.tokenizer = AutoTokenizer.from_pretrained("PaddleNLP/Qwen2.5-7B")
7272

7373
def test_encode(self):
7474
input_text = "hello world, this is paddle format checker"

0 commit comments

Comments (0)