17
17
MultiModalUUIDDict ,
18
18
)
19
19
from vllm .multimodal .processing import BaseMultiModalProcessor
20
- from vllm .transformers_utils .tokenizer import AnyTokenizer
20
+ from vllm .transformers_utils .tokenizer import AnyTokenizer , init_tokenizer_from_configs
21
+ from vllm .utils .jsontree import json_iter_leaves
21
22
22
23
from .data import (
23
24
DecoderOnlyInputs ,
@@ -44,17 +45,20 @@ class InputPreprocessor:
44
45
def __init__ (
45
46
self ,
46
47
model_config : ModelConfig ,
47
- tokenizer : Optional [AnyTokenizer ],
48
48
mm_registry : MultiModalRegistry = MULTIMODAL_REGISTRY ,
49
49
mm_processor_cache : Optional [BaseMultiModalProcessorCache ] = None ,
50
50
) -> None :
51
51
super ().__init__ ()
52
52
53
53
self .model_config = model_config
54
- self .tokenizer = tokenizer
55
54
self .mm_registry = mm_registry
56
55
self .mm_processor_cache = mm_processor_cache
57
56
57
+ if model_config .skip_tokenizer_init :
58
+ self .tokenizer = None
59
+ else :
60
+ self .tokenizer = init_tokenizer_from_configs (model_config )
61
+
58
62
def get_tokenizer (self ) -> AnyTokenizer :
59
63
if self .tokenizer is None :
60
64
raise ValueError (
@@ -273,7 +277,10 @@ def _process_multimodal(
273
277
mm_hashes = mm_input ["mm_hashes" ]
274
278
275
279
# Validate that all mm items have a string as their hash
276
- if not contains_only_strings (mm_hashes ):
280
+ contains_only_strings = all (
281
+ isinstance (leaf , str ) for leaf in json_iter_leaves (mm_hashes )
282
+ )
283
+ if not contains_only_strings :
277
284
raise ValueError (
278
285
f"mm_hashes must contain only strings, got: { mm_hashes } . "
279
286
"This is likely due to an incorrect custom implementation of "
@@ -693,15 +700,3 @@ def preprocess(
693
700
def clear_cache (self ) -> None :
694
701
if self .mm_processor_cache is not None :
695
702
self .mm_processor_cache .clear_cache ()
696
-
697
-
698
- # Helper function to validate that a nested dictionary contains
699
- # only strings or list of strings as the leaf values.
700
- def contains_only_strings (obj : object ):
701
- if isinstance (obj , str ):
702
- return True
703
- if isinstance (obj , list ):
704
- return all (isinstance (x , str ) for x in obj )
705
- if isinstance (obj , dict ):
706
- return all (contains_only_strings (v ) for v in obj .values ())
707
- return False
0 commit comments