Skip to content

Commit 7636733

Browse files
authored
Align MiniCPM preprocessing with the original model's inputs; make InternVL preprocessing a static method (#1003)
1 parent 47089ba commit 7636733

File tree

3 files changed

+16
-6
lines changed

3 files changed

+16
-6
lines changed

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,7 @@ def preprocess_inputs(
793793
image: Optional[Image] = None,
794794
processor: Optional[AutoImageProcessor] = None,
795795
tokenizer: Optional[PreTrainedTokenizer] = None,
796+
config: Optional[PretrainedConfig] = None,
796797
):
797798
"""
798799
Preprocess input instruction and an image.
@@ -969,6 +970,7 @@ def preprocess_inputs(
969970
image: Optional[Image] = None,
970971
processor: Optional[AutoImageProcessor] = None,
971972
tokenizer: Optional[PreTrainedTokenizer] = None,
973+
config: Optional[PretrainedConfig] = None,
972974
):
973975
if processor is None:
974976
raise ValueError("Processor is required.")
@@ -1282,12 +1284,13 @@ def merge_vision_text_embeddings(
12821284
input_embeds = input_embeds.reshape(B, N, C)
12831285
return input_embeds, attention_mask, position_ids
12841286

1287+
@staticmethod
12851288
def preprocess_inputs(
1286-
self,
12871289
text: str,
12881290
image: Optional[Image] = None,
12891291
processor: Optional[AutoImageProcessor] = None,
12901292
tokenizer: Optional[PreTrainedTokenizer] = None,
1293+
config: Optional[PretrainedConfig] = None,
12911294
):
12921295
if tokenizer is None:
12931296
raise ValueError("Tokenizer is required.")
@@ -1379,13 +1382,15 @@ def load_image(image, input_size=448, max_num=12):
13791382
return pixel_values
13801383

13811384
if image is not None:
1385+
if config is None:
1386+
raise ValueError("Config is required.")
13821387
if "<image>" not in text:
13831388
text = "<image>\n" + text
1384-
pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
1389+
pixel_values = load_image(image, input_size=config.vision_config.image_size)
13851390
num_patches = pixel_values.shape[0]
13861391
num_image_token = int(
1387-
(self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
1388-
* (self.config.downsample_ratio**2)
1392+
(config.vision_config.image_size // config.vision_config.patch_size) ** 2
1393+
* (config.downsample_ratio**2)
13891394
)
13901395
image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
13911396
text = text.replace("<image>", image_tokens, 1)
@@ -1660,6 +1665,7 @@ def preprocess_inputs(
16601665
image: Optional[Image] = None,
16611666
processor: Optional[AutoImageProcessor] = None,
16621667
tokenizer: Optional[PreTrainedTokenizer] = None,
1668+
config: Optional[PretrainedConfig] = None,
16631669
):
16641670
if processor is None:
16651671
raise ValueError("Processor is required.")
@@ -1673,6 +1679,7 @@ def preprocess_inputs(
16731679
else text
16741680
)
16751681
inputs = processor([prompt], [image], return_tensors="pt")
1682+
inputs.pop("image_sizes", None)
16761683
return inputs
16771684

16781685

@@ -1853,6 +1860,7 @@ def preprocess_inputs(
18531860
image: Optional[Image] = None,
18541861
processor: Optional[AutoImageProcessor] = None,
18551862
tokenizer: Optional[PreTrainedTokenizer] = None,
1863+
config: Optional[PretrainedConfig] = None,
18561864
):
18571865
if tokenizer is None:
18581866
raise ValueError("Tokenizer is required.")
@@ -2012,6 +2020,7 @@ def preprocess_inputs(
20122020
image: Optional[Image] = None,
20132021
processor: Optional[AutoImageProcessor] = None,
20142022
tokenizer: Optional[PreTrainedTokenizer] = None,
2023+
config: Optional[PretrainedConfig] = None,
20152024
):
20162025
if processor is None:
20172026
raise ValueError("Processor is required.")

optimum/intel/openvino/quantization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
785785

786786
try:
787787
inputs = self.model.preprocess_inputs(
788-
text=instruction, image=image, processor=processor, tokenizer=tokenizer
788+
text=instruction, image=image, processor=processor, tokenizer=tokenizer, config=self.model.config
789789
)
790790
except ValueError as value_error:
791791
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:

tests/openvino/test_modeling.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2165,10 +2165,11 @@ def get_preprocessors(self, model_arch):
21652165
)
21662166
preprocessors = {"processor": processor, "tokenizer": tokenizer}
21672167
elif model_arch == "internvl2":
2168+
config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
21682169
tokenizer = AutoTokenizer.from_pretrained(
21692170
model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
21702171
)
2171-
preprocessors = {"processor": None, "tokenizer": tokenizer}
2172+
preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
21722173
else:
21732174
processor = AutoProcessor.from_pretrained(
21742175
model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS

0 commit comments

Comments
 (0)