@@ -140,13 +140,33 @@ def get_paddleformers_tokenizer_config(
     return result
 
 
-class AutoTokenizer(hf.AutoTokenizer):
+def _bind_paddle_mixin_if_available(tokenizer_class):
+    """
+    Bind the PaddleTokenizerMixin if Paddle is available; otherwise, return the original class.
+
+    Args:
+        tokenizer_class: The original tokenizer class.
+
+    Returns:
+        The tokenizer class bound with PaddleTokenizerMixin, or the original class.
     """
-    Adapted from transformers.AutoTokenizer.from_pretrained with modifications:
-    1. Added get_paddleformers_tokenizer_config() to extend tokenizer_config.json download source
-    2. Explicitly binds PaddleTokenizerMixin to the tokenizer class before final instantiation
+    return type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
 
-    Note: This extends HuggingFace's standard tokenizer loading logic with PaddlePaddle integration.
+
+class AutoTokenizer(hf.AutoTokenizer):
+    """
+    Smart AutoTokenizer that automatically adapts based on available dependencies:
+
+    1. **Multi-source support**: Supports HuggingFace, PaddleFormers, and other download sources
+    2. **Conditional Paddle integration**: Automatically detects PaddlePaddle availability
+    3. **Fallback compatibility**: Works seamlessly with or without Paddle dependencies
+    4. **Enhanced functionality**: Extends HuggingFace's standard tokenizer loading logic
+
+    Features:
+    - Automatically binds PaddleTokenizerMixin when PaddlePaddle is available
+    - Falls back to pure Transformers mode when PaddlePaddle is not available
+    - Maintains full compatibility with all HuggingFace tokenizers
+    - Supports custom download sources through environment variables
     """
 
     @classmethod
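The heart of this refactor is dynamic subclassing: `type(name, bases, dict)` manufactures a new class whose MRO places `PaddleTokenizerMixin` ahead of the original tokenizer, so mixin overrides take precedence while the class keeps its original `__name__`. The hunk above binds unconditionally and the actual availability check is not visible in this diff; the sketch below is a minimal standalone illustration, assuming `importlib.util.find_spec("paddle")` as one plausible gate and using stand-in `PaddleTokenizerMixin` and `DummyTokenizer` classes that are not part of this PR.

```python
# Standalone sketch of the binding pattern used above. The availability
# check is an assumption (this hunk binds unconditionally); the classes
# here are stand-ins, not the real PaddleTokenizerMixin or a real tokenizer.
import importlib.util


class PaddleTokenizerMixin:
    """Stand-in mixin: layers Paddle-aware behavior over the base class."""

    def save_pretrained(self, save_directory, **kwargs):
        print("paddle-aware save")  # hypothetical Paddle-specific step
        return super().save_pretrained(save_directory, **kwargs)


class DummyTokenizer:
    """Stand-in for a HuggingFace tokenizer class."""

    def save_pretrained(self, save_directory, **kwargs):
        print(f"base save to {save_directory}")


def _bind_paddle_mixin_if_available(tokenizer_class):
    # Only bind the mixin when the paddle package is importable
    # (one plausible check; the real project may gate differently).
    if importlib.util.find_spec("paddle") is None:
        return tokenizer_class
    # type(name, bases, dict) builds a class whose MRO puts the mixin
    # in front of the original tokenizer, so mixin methods win.
    return type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})


BoundTokenizer = _bind_paddle_mixin_if_available(DummyTokenizer)
print([c.__name__ for c in BoundTokenizer.__mro__])
# With paddle installed:  ['DummyTokenizer', 'PaddleTokenizerMixin', 'DummyTokenizer', 'object']
# Without paddle:         ['DummyTokenizer', 'object']
```

Because the helper returns the original class untouched when Paddle is absent, callers never need their own fallback branch.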
@@ -201,7 +221,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
 
             if tokenizer_class is None:
                 raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Next, let's try to use the tokenizer_config file to get the tokenizer class.
@@ -268,6 +290,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
             )
         )
+
         if has_remote_code:
             if use_fast and tokenizer_auto_map[1] is not None:
                 class_ref = tokenizer_auto_map[1]
@@ -285,11 +308,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
             tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
             _ = kwargs.pop("code_revision", None)
             tokenizer_class.register_for_auto_class()
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(
                 pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
             )
         elif config_tokenizer_class is not None:
+
             tokenizer_class = None
             if use_fast and not config_tokenizer_class.endswith("Fast"):
                 tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
@@ -301,7 +327,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 raise ValueError(
                     f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                 )
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Otherwise we have to be creative.
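This hunk and the previous one lean on Transformers' naming convention for fast tokenizers: the Rust-backed variant of `BertTokenizer` is registered as `BertTokenizerFast`, so the loader first tries `f"{config_tokenizer_class}Fast"` and only then falls back to the exact configured name. A compact sketch of that resolution order, with a hypothetical `REGISTRY` dict standing in for `tokenizer_class_from_name`:

```python
# Sketch of the fast-variant candidate lookup; REGISTRY is a hypothetical
# stand-in for transformers' tokenizer_class_from_name().
REGISTRY = {"BertTokenizer": object, "BertTokenizerFast": object}


def resolve(config_tokenizer_class: str, use_fast: bool) -> str:
    # Prefer the fast variant when requested and not already configured.
    if use_fast and not config_tokenizer_class.endswith("Fast"):
        candidate = f"{config_tokenizer_class}Fast"
        if candidate in REGISTRY:
            return candidate
    # Fall back to the exact class named in tokenizer_config.json.
    if config_tokenizer_class in REGISTRY:
        return config_tokenizer_class
    raise ValueError(
        f"Tokenizer class {config_tokenizer_class} does not exist or is not currently imported."
    )


print(resolve("BertTokenizer", use_fast=True))   # BertTokenizerFast
print(resolve("BertTokenizer", use_fast=False))  # BertTokenizer
```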
@@ -321,15 +349,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
 
         if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
-            tokenizer_class_fast = type(
-                tokenizer_class_fast.__name__, (PaddleTokenizerMixin, tokenizer_class_fast), {}
-            )
+            # Bind PaddleTokenizerMixin
+            tokenizer_class_fast = _bind_paddle_mixin_if_available(tokenizer_class_fast)
             return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         else:
             if tokenizer_class_py is not None:
-                tokenizer_class_py = type(
-                    tokenizer_class_py.__name__, (PaddleTokenizerMixin, tokenizer_class_py), {}
-                )
+                # Bind PaddleTokenizerMixin
+                tokenizer_class_py = _bind_paddle_mixin_if_available(tokenizer_class_py)
                 return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
             else:
                 raise ValueError(
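Taken together, every exit point of `from_pretrained` (explicit tokenizer class, tokenizer_config.json, remote code, and the `TOKENIZER_MAPPING` fallback) now funnels through the same helper, so the Paddle policy lives in one place instead of four inlined `type(...)` calls. A quick way to check which mode was picked at runtime; the import path and model name below are placeholders for illustration, not names from this diff:

```python
# Hedged usage sketch; replace the placeholder import path with wherever
# this AutoTokenizer subclass actually lives in the project.
from your_project.tokenizer_auto import AutoTokenizer  # placeholder path

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model

# Whichever branch of from_pretrained() produced the class, the instance's
# MRO reveals whether PaddleTokenizerMixin was bound (Paddle installed)
# or the loader fell back to the plain Transformers class.
print([c.__name__ for c in type(tok).__mro__])
```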