Skip to content

Commit 54a9727

Browse files
authored
add minicpmv support (#972)
1 parent a46ec67 commit 54a9727

File tree

6 files changed

+977
-30
lines changed

6 files changed

+977
-30
lines changed

optimum/exporters/openvino/model_configs.py

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@
7575
JaisModelPatcher,
7676
LlamaModelPatcher,
7777
LlavaImageEmbeddingModelPatcher,
78+
MiniCPMVImageEmbeddingsModelPatcher,
79+
MiniCPMVResamplerModelPatcher,
7880
MistralModelPatcher,
7981
MixtralModelPatcher,
8082
MPTModelPatcher,
@@ -1738,3 +1740,272 @@ def patch_model_for_export(
17381740
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
17391741
) -> ModelPatcher:
17401742
return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs)
1743+
1744+
1745+
class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator):
    """Dummy-input generator for the MiniCPM-V vision encoder export.

    Produces `pixel_values`, `patch_attention_mask` and `position_ids` shaped
    according to the vision tower's patch grid.
    """

    SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = DEFAULT_DUMMY_SHAPES["width"],
        height: int = DEFAULT_DUMMY_SHAPES["height"],
        **kwargs,
    ):
        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
        # Patch size of the vision tower; all patch-grid shapes below derive from it.
        self.patch_size = normalized_config.config.patch_size

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Generate one dummy tensor for `input_name` in the requested framework/dtype."""
        if input_name == "pixel_values":
            return self.random_float_tensor(
                shape=[
                    self.batch_size,
                    self.num_channels,
                    self.patch_size,
                    (self.height * self.width) // self.patch_size,
                ],
                framework=framework,
                dtype=float_dtype,
            )

        if input_name == "patch_attention_mask":
            # NOTE(review): the mask is generated as an int tensor but requested
            # with `float_dtype` — presumably consumed as a float mask downstream;
            # confirm against the vision-embeddings patcher.
            return self.random_int_tensor(
                shape=[self.batch_size, 1, (self.height // self.patch_size) * (self.width // self.patch_size)],
                framework=framework,
                dtype=float_dtype,
                min_value=0,
                max_value=2,
            )

        if input_name == "position_ids":
            # Fix: forward `framework` and `int_dtype` like the other branches do,
            # so a non-default framework request does not silently fall back to
            # the generator's defaults.
            return self.random_int_tensor(
                shape=[self.batch_size, (self.height // self.patch_size) * (self.width // self.patch_size)],
                max_value=self.patch_size,
                framework=framework,
                dtype=int_dtype,
            )
1788+
1789+
1790+
class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
    """Dummy-input generator for the MiniCPM-V resampler export.

    Produces the vision-feature tensor, its positional embedding and the key
    padding mask, sized from the vision config's image/patch geometry.
    """

    SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = DEFAULT_DUMMY_SHAPES["width"],
        height: int = DEFAULT_DUMMY_SHAPES["height"],
        **kwargs,
    ):
        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
        self.patch_size = normalized_config.config.patch_size
        # LLM hidden size (pos_embed dimension) vs. vision-tower hidden size
        # (image_feature dimension) are distinct on purpose.
        self.hidden_size = normalized_config.config.hidden_size
        self.img_hidden_size = normalized_config.config.vision_config.hidden_size
        # Number of patches per image: (image_size / patch_size) ** 2.
        self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * (
            normalized_config.config.vision_config.image_size // self.patch_size
        )

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Generate one dummy tensor for `input_name` in the requested framework/dtype."""
        if input_name == "image_feature":
            return self.random_float_tensor(
                shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype
            )

        if input_name == "key_padding_mask":
            # NOTE(review): `DTYPE_MAPPER.pt` hardcodes the torch mapper regardless
            # of `framework` — acceptable while export is torch-only; revisit if
            # other frameworks are supported.
            return self.constant_tensor(
                shape=[self.batch_size, self.feat_size],
                framework=framework,
                value=1,
                dtype=DTYPE_MAPPER.pt(float_dtype),
            )

        if input_name == "pos_embed":
            # Fix: forward `framework` and `float_dtype` like the other branches,
            # instead of silently using the generator's defaults.
            return self.random_float_tensor(
                shape=[self.feat_size, self.batch_size, self.hidden_size],
                framework=framework,
                dtype=float_dtype,
            )
1827+
1828+
1829+
class MiniCPMVConfigBehavior(str, enum.Enum):
    """Selector for the MiniCPM-V sub-model being exported.

    Each member names one exported sub-graph: the vision resampler, the LLM
    backbone, the image encoder, and the token-embedding lookup.
    """

    RESAMPLER = "resampler"  # model.resampler
    LANGUAGE = "language"  # model.llm
    VISION_EMBEDDINGS = "vision_embeddings"  # model.vpm
    TEXT_EMBEDDINGS = "text_embeddings"  # model.get_input_embeddings()
1834+
1835+
1836+
@register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers")
class MiniCPMVOpenVINOConfig(OnnxConfig):
    """OpenVINO export config for MiniCPM-V multimodal models.

    The model is exported as several sub-graphs (see `MiniCPMVConfigBehavior`);
    one instance of this config describes exactly one of them, selected via
    `behavior`. `with_behavior` builds the sibling config for another sub-graph,
    and `get_model_for_behavior` extracts the matching submodule.
    """

    SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = ()

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
    ):
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )
        self._behavior = behavior
        # Keep the full multimodal config: `self._config` may be narrowed to a
        # sub-config below, but `with_behavior` needs the original to build
        # sibling configs.
        self._orig_config = config
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,)
        self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Input names with their dynamic axes for the selected sub-graph.

        Empty for LANGUAGE/TEXT_EMBEDDINGS: those behaviors delegate to the
        configs built in `with_behavior`.
        """
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return {
                "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
                "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"},
                "position_ids": {0: "batch_size", 1: "patch_size"},
            }
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return {
                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
            }
        return {}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Output names with their dynamic axes for the selected sub-graph."""
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}}
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return {"last_hidden_state": {0: "batch_size"}}

        return {}

    def _get_text_generation_export_config(self):
        """Resolve and instantiate the registered `text-generation-with-past`
        OpenVINO export config for the LLM backbone (qwen2 for MiniCPM-V).

        Shared by the TEXT_EMBEDDINGS and LANGUAGE branches of `with_behavior`.
        Raises ValueError when no suitable config is registered.
        """
        model_type = "qwen2"
        model_type = model_type.replace("_", "-")

        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            raise ValueError(
                f"Unsupported language model type provided `{model_type}`. Please define custom export config"
            )

        if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
            raise ValueError(
                f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
            )
        internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
            "text-generation-with-past"
        ]
        return internal_export_config_class(
            self._orig_config,
            use_past=True,
            use_past_in_inputs=True,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
        )

    def with_behavior(
        self,
        behavior: Union[str, MiniCPMVConfigBehavior],
    ):
        """
        Creates a config for different behaviour.

        Args:
            behavior ([`ConfigBehavior`]):
                The behavior to use for the new instance.
        """
        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
            behavior = MiniCPMVConfigBehavior(behavior)

        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
            internal_export_config = self._get_text_generation_export_config()
            # Only the normalized-config class is borrowed from the LLM config.
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
            internal_export_config = self._get_text_generation_export_config()
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        # VISION_EMBEDDINGS and RESAMPLER are both handled by this class itself;
        # the two branches were identical, merged here.
        if behavior in (MiniCPMVConfigBehavior.VISION_EMBEDDINGS, MiniCPMVConfigBehavior.RESAMPLER):
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def get_model_for_behavior(self, model, behavior: Union[str, MiniCPMVConfigBehavior]):
        """Return the submodule of `model` that the given behavior exports."""
        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
            behavior = MiniCPMVConfigBehavior(behavior)

        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
            return model.llm

        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return model.vpm

        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.get_input_embeddings()
            # Attach the LLM config so downstream export code can read it.
            text_embedding.config = model.llm.config
            return text_embedding
        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
            # Attach the vision config so downstream export code can read it.
            model.resampler.config = model.vpm.config
            return model.resampler

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Wrap `model` in the behavior-specific patcher used during export."""
        model_kwargs = model_kwargs or {}
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs)

        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)

        return super().patch_model_for_export(model, model_kwargs)

0 commit comments

Comments
 (0)