
Commit a3a0342

[feature] lazyimport-and-tokenizer (#2481)
1 parent a68fb12 commit a3a0342

File tree

12 files changed: +674, -42 lines


paddleformers/quantization/quantization_config.py

Lines changed: 6 additions & 2 deletions
@@ -16,7 +16,10 @@
 import json
 from dataclasses import dataclass
 
-from paddle.nn.quant.quantized_linear import _get_arch_info
+try:
+    from paddle.nn.quant.quantized_linear import _get_arch_info
+except:
+    _get_arch_info = None
 
 quant_inference_mapping = {"avg": "abs_max", "abs_max_channel_wise": "abs_max_channel_wise", "abs_max": "abs_max"}
 fp8_format_mapping = {
@@ -114,7 +117,8 @@ def __init__(
                 f"weight_quantize_algo:{weight_quantize_algo} not in supported list ['weight_only_int8', 'weight_only_int4', 'llm.int8', 'a8w8', 'nf4', 'fp4']"
             )
         if (
-            (isinstance(weight_quantize_algo, dict) and "fp8linear" in weight_quantize_algo)
+            _get_arch_info is not None
+            and (isinstance(weight_quantize_algo, dict) and "fp8linear" in weight_quantize_algo)
             or weight_quantize_algo == "fp8linear"
         ) and _get_arch_info() not in [89, 90]:
             raise RuntimeError("fp8Linear is only supported on NVIDIA Hopper GPUs.")
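The hunk above guards an optional import: the symbol is imported inside try/except, set to None when unavailable, and the architecture check only applies when the probe exists. Below is a minimal standalone sketch of that guarded-import pattern; probe_gpu_arch and require_hopper_for_fp8 are illustrative names, not PaddleFormers APIs.

# Sketch of the guarded-import pattern, assuming Paddle may be absent or too old.
# probe_gpu_arch / require_hopper_for_fp8 are illustrative names, not PaddleFormers APIs.
try:
    from paddle.nn.quant.quantized_linear import _get_arch_info as probe_gpu_arch
except ImportError:
    probe_gpu_arch = None  # degrade gracefully instead of failing at import time


def require_hopper_for_fp8(weight_quantize_algo) -> None:
    """Raise when an fp8 algorithm is requested but the GPU arch cannot be confirmed as SM 89/90."""
    wants_fp8 = weight_quantize_algo == "fp8linear" or (
        isinstance(weight_quantize_algo, dict) and "fp8linear" in weight_quantize_algo
    )
    if not wants_fp8 or probe_gpu_arch is None:
        return  # nothing to check, or the probe is unavailable
    if probe_gpu_arch() not in (89, 90):
        raise RuntimeError("fp8linear requires an NVIDIA GPU with SM 89 or 90.")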

paddleformers/transformers/__init__.py

Lines changed: 10 additions & 8 deletions
@@ -18,8 +18,6 @@
 from typing import TYPE_CHECKING
 from ..utils.lazy_import import _LazyModule
 
-from .download_utils import *
-
 # from .auto.modeling import AutoModelForCausalLM
 import_structure = {
     "kto_criterion": [
@@ -38,14 +36,10 @@
         "is_chinese_char",
         "AddedToken",
         "normalize_chars",
-        "tokenize_special_chars,convert_to_unicode,",
+        "tokenize_special_chars",
+        "convert_to_unicode",
         "PreTrainedTokenizer",
     ],
-    "tokenizer_utils_base": [
-        "PaddingStrategy",
-        "TextInput",
-        "TensorType",
-    ],
     "attention_utils": ["create_bigbird_rand_mask_idx_list"],
     "tensor_parallel_utils": [],
     "configuration_utils": ["PretrainedConfig"],
@@ -90,6 +84,11 @@
         "AutoDiscriminator",
         "AutoModelForConditionalGeneration",
     ],
+    "tokenizer_utils_base": [
+        "PaddingStrategy",
+        "TextInput",
+        "TensorType",
+    ],
     "auto.processing": ["AutoProcessor"],
     "auto.tokenizer": ["AutoTokenizer"],
     "deepseek_v2.configuration": ["DeepseekV2Config"],
@@ -322,6 +321,8 @@
         "Qwen3MoePretrainingCriterion",
     ],
     "qwen3_moe.modeling_pp": ["Qwen3MoeForCausalLMPipe"],
+    "ernie4_5vl.tokenizer": ["Ernie4_5_VLTokenizer"],
+    "ernie4_5vl": [],
     "bert": [],
     "llama": [],
     "qwen2": [],
@@ -348,6 +349,7 @@
         tokenize_special_chars,
         convert_to_unicode,
     )
+    from .tokenizer_utils_fast import PretrainedTokenizerFast
     from .processing_utils import ProcessorMixin
     from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
     from .image_processing_utils import ImageProcessingMixin
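In this module, import_structure maps each submodule to the public names it exports, and _LazyModule resolves those names on first attribute access instead of importing every submodule when the package is imported. The following is a rough, generic sketch of how such a lazy module can be built; it illustrates the idea only and is not PaddleFormers' actual _LazyModule implementation.

import importlib
import types


class LazyModule(types.ModuleType):
    """Illustrative lazy module: defers submodule imports until an attribute is first used."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported symbol back to the submodule that defines it.
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items()
            for symbol in symbols
        }
        self.__all__ = list(self._symbol_to_module)

    def __getattr__(self, name):
        if name in self._symbol_to_module:
            submodule = importlib.import_module(f".{self._symbol_to_module[name]}", self.__name__)
            value = getattr(submodule, name)
            setattr(self, name, value)  # cache so later lookups skip __getattr__
            return value
        raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")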

paddleformers/transformers/auto/tokenizer.py

Lines changed: 40 additions & 14 deletions
@@ -140,13 +140,33 @@ def get_paddleformers_tokenizer_config(
     return result
 
 
-class AutoTokenizer(hf.AutoTokenizer):
+def _bind_paddle_mixin_if_available(tokenizer_class):
+    """
+    Bind the PaddleTokenizerMixin if Paddle is available; otherwise, return the original class.
+
+    Args:
+        tokenizer_class: The original tokenizer class.
+
+    Returns:
+        The tokenizer class bound with PaddleTokenizerMixin, or the original class.
     """
-    Adapted from transformers.AutoTokenizer.from_pretrained with modifications:
-    1. Added get_paddleformers_tokenizer_config() to extend tokenizer_config.json download source
-    2. Explicitly binds PaddleTokenizerMixin to the tokenizer class before final instantiation
+    return type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
 
-    Note: This extends HuggingFace's standard tokenizer loading logic with PaddlePaddle integration.
+
+class AutoTokenizer(hf.AutoTokenizer):
+    """
+    Smart AutoTokenizer that automatically adapts based on available dependencies:
+
+    1. **Multi-source support**: Supports HuggingFace, PaddleFormers, and other download sources
+    2. **Conditional Paddle integration**: Automatically detects PaddlePaddle availability
+    3. **Fallback compatibility**: Works seamlessly with or without Paddle dependencies
+    4. **Enhanced functionality**: Extends HuggingFace's standard tokenizer loading logic
+
+    Features:
+    - Automatically binds PaddleTokenizerMixin when PaddlePaddle is available
+    - Falls back to pure Transformers mode when PaddlePaddle is not available
+    - Maintains full compatibility with all HuggingFace tokenizers
+    - Supports custom download sources through environment variables
     """
 
     @classmethod
@@ -201,7 +221,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
 
             if tokenizer_class is None:
                 raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Next, let's try to use the tokenizer_config file to get the tokenizer class.
@@ -268,6 +290,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
             )
         )
+
        if has_remote_code:
            if use_fast and tokenizer_auto_map[1] is not None:
                class_ref = tokenizer_auto_map[1]
@@ -285,11 +308,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            tokenizer_class.register_for_auto_class()
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
            return tokenizer_class.from_pretrained(
                pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
            )
        elif config_tokenizer_class is not None:
+
            tokenizer_class = None
            if use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
@@ -301,7 +327,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
        # Otherwise we have to be creative.
@@ -321,15 +349,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
 
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
-                tokenizer_class_fast = type(
-                    tokenizer_class_fast.__name__, (PaddleTokenizerMixin, tokenizer_class_fast), {}
-                )
+                # Bind PaddleTokenizerMixin
+                tokenizer_class_fast = _bind_paddle_mixin_if_available(tokenizer_class_fast)
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
-                    tokenizer_class_py = type(
-                        tokenizer_class_py.__name__, (PaddleTokenizerMixin, tokenizer_class_py), {}
-                    )
+                    # Bind PaddleTokenizerMixin
+                    tokenizer_class_py = _bind_paddle_mixin_if_available(tokenizer_class_py)
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
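The refactor centralizes the mixin binding in _bind_paddle_mixin_if_available, which builds a new class on the fly with type(name, bases, dict) so that PaddleTokenizerMixin sits before the original tokenizer class in the MRO. Here is a small standalone sketch of that dynamic-mixin technique; the classes are illustrative stand-ins, not the real tokenizer classes.

class PaddleMixinSketch:
    """Stand-in for PaddleTokenizerMixin: adds behavior around selected methods."""

    def save_pretrained(self, path):
        print("extra Paddle-specific bookkeeping")
        return super().save_pretrained(path)


class BaseTokenizerSketch:
    """Stand-in for a HuggingFace tokenizer class."""

    def save_pretrained(self, path):
        print(f"saving to {path}")


def bind_mixin(tokenizer_class, mixin=PaddleMixinSketch):
    # The mixin is listed first so its methods win in the MRO,
    # while the generated class keeps the original class name.
    return type(tokenizer_class.__name__, (mixin, tokenizer_class), {})


BoundTokenizer = bind_mixin(BaseTokenizerSketch)
BoundTokenizer().save_pretrained("/tmp/tok")  # mixin hook runs first, then the base method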

paddleformers/transformers/configuration_utils.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,6 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import paddle
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError
 
@@ -581,6 +580,8 @@ def __init__(self, **kwargs):
         if "torch_dtype" in kwargs:
             self.dtype = kwargs.pop("torch_dtype")
         else:
+            import paddle
+
             self.dtype = kwargs.pop("dtype", paddle.get_default_dtype())
 
         # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
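Moving import paddle from module scope into the branch that needs it means importing configuration_utils no longer requires a working Paddle installation; the dependency is only exercised when a default dtype has to be resolved. A minimal sketch of the same deferred-import idea follows, using a hypothetical resolve_dtype helper rather than the real __init__ logic.

def resolve_dtype(kwargs):
    """Illustrative helper: only touch paddle when no explicit dtype was given."""
    if "torch_dtype" in kwargs:
        return kwargs.pop("torch_dtype")
    if "dtype" in kwargs:
        return kwargs.pop("dtype")
    # Deferred import: the module stays importable even when Paddle is not installed.
    import paddle

    return paddle.get_default_dtype()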
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "tokenizer": ["Ernie4_5_VLTokenizer"],
+    "configuration": [
+        "Ernie4_5_VLMoEConfig",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .configuration import *
+    from .tokenizer import Ernie4_5_VLTokenizer
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )
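This new package __init__ (presumably the ernie4_5vl package under paddleformers/transformers, given the ernie4_5vl entries added to import_structure above) swaps itself for a _LazyModule in sys.modules, so the tokenizer and configuration submodules are imported only on first access, while the TYPE_CHECKING branch keeps static type checkers aware of the real symbols. A similar effect can be approximated with a module-level __getattr__ (PEP 562); the sketch below is a generic illustration under that assumption, not the project's _LazyModule.

# Generic sketch of lazy attribute loading in a package __init__.py (PEP 562),
# shown as an alternative to the _LazyModule approach used in the commit.
import importlib
from typing import TYPE_CHECKING

_LAZY_ATTRS = {
    "Ernie4_5_VLTokenizer": ".tokenizer",
    "Ernie4_5_VLMoEConfig": ".configuration",
}

if TYPE_CHECKING:
    # Static checkers see the real symbols; nothing is imported at runtime yet.
    from .configuration import Ernie4_5_VLMoEConfig
    from .tokenizer import Ernie4_5_VLTokenizer


def __getattr__(name):
    if name in _LAZY_ATTRS:
        module = importlib.import_module(_LAZY_ATTRS[name], __package__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")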

0 commit comments
