# [OpenVINO] Add support for minicpmv4/4_5 (#1412)
Base branch: `main`
Commits: `4b70b7f`, `7c64417`, `4b245f1`, `81f69be`, `6974a3e`, `bd8b41f`, `6c0c617`, `05fad58`, `b4e2ce1`, `39223a4`, `bd1adbd`, `6aec742`, `f6d7750`, `a96d993`, `02a4acf`
In the exporter config, the text backbone for the text-embeddings and language subconfigs is now selected from the model's `version` field instead of being hardcoded to `"qwen2"`:

```diff
@@ -2625,12 +2625,22 @@ def with_behavior(
         """
         if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
 
+        model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}
+
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
-            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+            return get_vlm_text_embeddings_config(
+                model_mapping[self._orig_config.version],
+                self._orig_config,
+                self.int_dtype,
+                self.float_dtype,
+            )
 
         if behavior == MiniCPMVConfigBehavior.LANGUAGE:
-            return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+            return get_vlm_text_generation_config(
+                model_mapping[self._orig_config.version],
+                self._orig_config,
+                self.int_dtype,
+                self.float_dtype,
+            )
 
         if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
```
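For illustration, a standalone sketch (not code from the PR) of how this lookup behaves. The `model_mapping` values are taken from the diff above; the example version `3.0` is hypothetical:

```python
# Version-to-text-backbone lookup used by the exporter config above.
model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}

for version in (2.6, 4.0, 4.5):
    print(f"MiniCPM-V {version} -> {model_mapping[version]} text config")

# A release missing from the mapping fails loudly with KeyError, so any
# new MiniCPM-V version needs an explicit entry here.
try:
    model_mapping[3.0]
except KeyError as missing:
    print(f"unsupported version: {missing}")
```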
In the tests, the new architectures are registered and added to the existing MiniCPM-V special cases:

```diff
@@ -497,7 +497,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         if is_transformers_version(">", "4.49"):
             SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
         if is_transformers_version(">=", "4.51"):
-            SUPPORTED_ARCHITECTURES += ["llama4"]
+            SUPPORTED_ARCHITECTURES += ["llama4", "minicpmv4", "minicpmv4_5"]
         if is_transformers_version("<", "4.52"):
             SUPPORTED_ARCHITECTURES += ["minicpmo"]
 
@@ -506,7 +506,17 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 
     TASK = "image-text-to-text"
-    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
+    REMOTE_CODE_MODELS = [
+        "internvl_chat",
+        "minicpmv",
+        "minicpmv4",
+        "minicpmv4_5",
+        "minicpmo",
+        "llava-qwen2",
+        "phi3_v",
+        "maira2",
+        "phi4mm",
+    ]
 
     IMAGE = Image.open(
         requests.get(
@@ -611,7 +621,7 @@ def test_compare_to_transformers(self, model_arch):
         self._check_device_and_request(ov_model, test_device, False)
 
         # pytorch minicpmv and internvl_chat are not designed to be used via forward
-        if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch not in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
@@ -656,7 +666,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs["past_key_values"] = DynamicCache()
 
         with torch.no_grad():
-            if model_arch in ["minicpmo"]:
+            if model_arch in ["minicpmo", "minicpmv4", "minicpmv4_5"]:
                 # `generate` method for minicpmo requires tokenizer
                 tokenizer = AutoTokenizer.from_pretrained(
                     model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
@@ -670,7 +680,7 @@ def test_compare_to_transformers(self, model_arch):
             transformers_outputs = transformers_outputs[1].sequences
 
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -696,7 +706,7 @@ def test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
@@ -714,7 +724,7 @@ def test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmv4", "minicpmv4_5", "minicpmo", "internvl_chat"]:
            ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
@@ -837,6 +847,9 @@ def test_generate_utils(self, model_arch):
             input_audio = self._generate_random_audio_data()
             question = "Translate this audio to French"
             inputs = model.preprocess_inputs(**preprocessors, text=question, audio=[input_audio])
+            # skip the temporal_ids which makes the number of loop inconstant:
+            # https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261
+            inputs.pop("temporal_ids", None)
             outputs = model.generate(**inputs, max_new_tokens=10)
             # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
             outputs = outputs[:, inputs["input_ids"].shape[1] :]
```

**Review thread on lines +849 to +851 (the `temporal_ids` workaround):**

**Reviewer:** I don't quite understand. To me it degrades the user experience that we make users skip this parameter after preprocessing in order to match the reference results.

**Author:** Yes, it will impact the user experience in some cases. I think it's one of the limitations of TorchScript. I will try to find a better solution for handling this parameter if possible.
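For users reproducing the comparison outside the test suite, here is a minimal sketch of the same workaround, assuming this PR's export path. The model id, image URL, prompt, and the exact `preprocess_inputs` signature are assumptions for illustration, not part of the diff:

```python
import requests
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer
from optimum.intel import OVModelForVisualCausalLM

model_id = "openbmb/MiniCPM-V-4_5"  # assumed model id
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
inputs = model.preprocess_inputs(
    text="What is shown in this image?",
    image=image,
    processor=processor,
    tokenizer=tokenizer,
    config=model.config,
)

# Same workaround as the test above: drop temporal_ids so the resampler loop
# count stays constant (https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/resampler.py#L261).
inputs.pop("temporal_ids", None)

outputs = model.generate(**inputs, max_new_tokens=10)
# Strip the prompt tokens, as the test does, before decoding.
answer = tokenizer.batch_decode(
    outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```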
**Review thread on `model_mapping = {2.6: "llama", 4.0: "qwen2", 4.5: "qwen3"}`:**

**Reviewer:** Should use str for versions.

**Author:** May I ask why? The version in the model's config is a number: https://huggingface.co/openbmb/MiniCPM-V-4_5/blob/main/config.json#L3

**Reviewer:** Ah okay, I see! Thanks for the clarification. (It's generally a bad idea to use numbers for versions: 4.0 becomes 4, and 4.10 and 4.1 are the same version 😅)
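The reviewer's caveat is easy to demonstrate in isolation (a standalone illustration, not code from the PR):

```python
# Float version numbers lose information that string versions keep.
print(str(4.0))                    # "4.0" — the trailing .0 survives here, but:
print(4.10 == 4.1)                 # True: both literals parse to the same float
print({4.1: "old", 4.10: "new"})   # {4.1: 'new'} — the dict keys collide
assert "4.10" != "4.1"             # string versions stay distinct
```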