|
75 | 75 | JaisModelPatcher, |
76 | 76 | LlamaModelPatcher, |
77 | 77 | LlavaImageEmbeddingModelPatcher, |
| 78 | + MiniCPMVImageEmbeddingsModelPatcher, |
| 79 | + MiniCPMVResamplerModelPatcher, |
78 | 80 | MistralModelPatcher, |
79 | 81 | MixtralModelPatcher, |
80 | 82 | MPTModelPatcher, |
@@ -1738,3 +1740,272 @@ def patch_model_for_export( |
1738 | 1740 | self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None |
1739 | 1741 | ) -> ModelPatcher: |
1740 | 1742 | return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs) |
| 1743 | + |
| 1744 | + |
class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator):
    """Generates dummy vision-encoder inputs for MiniCPM-V export.

    Produces `pixel_values` in the patch-flattened layout consumed by the
    MiniCPM-V vision tower, plus the matching `patch_attention_mask` and
    `position_ids` tensors.
    """

    SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = DEFAULT_DUMMY_SHAPES["width"],
        height: int = DEFAULT_DUMMY_SHAPES["height"],
        **kwargs,
    ):
        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
        # Vision-tower patch size, read from the wrapped model config.
        self.patch_size = normalized_config.config.patch_size

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Generate the dummy tensor for `input_name`; returns None for unknown names."""
        # Number of patch positions covering the dummy image.
        num_patches = (self.height // self.patch_size) * (self.width // self.patch_size)

        if input_name == "pixel_values":
            # Patch-flattened layout: (batch, channels, patch_size, H*W // patch_size).
            return self.random_float_tensor(
                shape=[
                    self.batch_size,
                    self.num_channels,
                    self.patch_size,
                    (self.height * self.width) // self.patch_size,
                ],
                framework=framework,
                dtype=float_dtype,
            )

        if input_name == "patch_attention_mask":
            # NOTE(review): the 0/1 mask is generated with the *float* dtype —
            # confirm the vision tower indeed expects a floating-point mask.
            return self.random_int_tensor(
                shape=[self.batch_size, 1, num_patches],
                framework=framework,
                dtype=float_dtype,
                min_value=0,
                max_value=2,
            )

        if input_name == "position_ids":
            # Fix: forward `framework`/`int_dtype` so that non-default frameworks
            # (e.g. "np") are honored instead of silently falling back to the
            # generator's torch/int64 defaults.
            return self.random_int_tensor(
                shape=[self.batch_size, num_patches],
                max_value=self.patch_size,
                framework=framework,
                dtype=int_dtype,
            )
| 1788 | + |
| 1789 | + |
class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
    """Generates dummy inputs for the MiniCPM-V resampler sub-model:
    image features, positional embeddings and the key padding mask.
    """

    SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = DEFAULT_DUMMY_SHAPES["width"],
        height: int = DEFAULT_DUMMY_SHAPES["height"],
        **kwargs,
    ):
        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
        self.patch_size = normalized_config.config.patch_size
        # Resampler/language-side hidden size.
        self.hidden_size = normalized_config.config.hidden_size
        # Vision-encoder hidden size (feature dim of the image encoder output).
        self.img_hidden_size = normalized_config.config.vision_config.hidden_size
        # Number of patches for a square image of `vision_config.image_size`.
        self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * (
            normalized_config.config.vision_config.image_size // self.patch_size
        )

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Generate the dummy tensor for `input_name`; returns None for unknown names."""
        if input_name == "image_feature":
            return self.random_float_tensor(
                shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype
            )

        if input_name == "key_padding_mask":
            # Fix: map the dtype through the mapper matching the requested
            # framework — the original always used DTYPE_MAPPER.pt, which hands
            # a torch dtype to the "np" path.
            return self.constant_tensor(
                shape=[self.batch_size, self.feat_size],
                framework=framework,
                value=1,
                dtype=getattr(DTYPE_MAPPER, framework)(float_dtype),
            )

        if input_name == "pos_embed":
            # Fix: forward `framework`/`float_dtype` (previously always
            # defaulted to torch fp32).
            # NOTE(review): layout is (feat_size, batch, hidden) — sequence
            # first; confirm against the resampler's expected pos_embed layout.
            return self.random_float_tensor(
                shape=[self.feat_size, self.batch_size, self.hidden_size],
                framework=framework,
                dtype=float_dtype,
            )
| 1827 | + |
| 1828 | + |
class MiniCPMVConfigBehavior(str, enum.Enum):
    """The four sub-models a MiniCPM-V checkpoint is exported as.

    Inherits from `str` so members compare equal to their plain string values.
    """

    RESAMPLER = "resampler"  # vision-feature resampler
    LANGUAGE = "language"  # decoder LM (inputs_embeds driven)
    VISION_EMBEDDINGS = "vision_embeddings"  # image encoder
    TEXT_EMBEDDINGS = "text_embeddings"  # token embedding lookup
| 1834 | + |
| 1835 | + |
@register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers")
class MiniCPMVOpenVINOConfig(OnnxConfig):
    """OpenVINO export config for MiniCPM-V image-text-to-text models.

    The model is exported as four sub-models selected via `behavior`:
    vision embeddings, resampler, text embeddings and the language model.
    """

    SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
    DUMMY_INPUT_GENERATOR_CLASSES = ()

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "feature-extraction",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS,
        preprocessors: Optional[List[Any]] = None,
    ):
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )
        self._behavior = behavior
        # Keep the composite config: `self._config` may be narrowed to a
        # sub-config below, but with_behavior() needs the original.
        self._orig_config = config
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
            self._config = config.vision_config
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,)
        self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Input names with their dynamic axes for the current behavior."""
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return {
                "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
                "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"},
                "position_ids": {0: "batch_size", 1: "patch_size"},
            }
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return {
                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
            }
        # TEXT_EMBEDDINGS / LANGUAGE behaviors delegate to configs built in
        # with_behavior(), so this config itself exposes no inputs for them.
        return {}

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Output names with their dynamic axes for the current behavior."""
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}}
        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return {"last_hidden_state": {0: "batch_size"}}

        return {}

    def _get_text_generation_export_config(self, model_type: str):
        """Build the registered `text-generation-with-past` export config for `model_type`.

        Shared by the TEXT_EMBEDDINGS and LANGUAGE branches of `with_behavior`.

        Raises:
            ValueError: if `model_type` has no registered OpenVINO export
                config, or no `text-generation-with-past` task for it.
        """
        model_type = model_type.replace("_", "-")
        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            raise ValueError(
                f"Unsupported language model type provided `{model_type}`. Please define custom export config"
            )

        if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
            raise ValueError(
                f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
            )
        internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
            "text-generation-with-past"
        ]
        return internal_export_config_class(
            self._orig_config,
            use_past=True,
            use_past_in_inputs=True,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
        )

    def with_behavior(
        self,
        behavior: Union[str, MiniCPMVConfigBehavior],
    ):
        """
        Creates a config for a different behavior.

        Args:
            behavior ([`MiniCPMVConfigBehavior`]):
                The behavior to use for the new instance.

        Returns None for unrecognized behaviors.
        """
        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
            behavior = MiniCPMVConfigBehavior(behavior)

        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
            # MiniCPM-V's language backbone is qwen2; reuse its registered config.
            internal_export_config = self._get_text_generation_export_config("qwen2")
            # NOTE(review): this mutates a class attribute of
            # InputEmbedOpenvVINOConfig globally — kept as in the original.
            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
            export_config = InputEmbedOpenvVINOConfig(
                self._orig_config,
                task="feature-extraction",
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
            )
            return export_config

        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
            internal_export_config = self._get_text_generation_export_config("qwen2")
            export_config = LMInputEmbedsConfigHelper(internal_export_config)
            export_config._normalized_config = internal_export_config._normalized_config
            return export_config

        # VISION_EMBEDDINGS and RESAMPLER are handled by this class itself
        # (the original had two byte-identical branches; merged here).
        if behavior in (MiniCPMVConfigBehavior.VISION_EMBEDDINGS, MiniCPMVConfigBehavior.RESAMPLER):
            return self.__class__(
                self._orig_config,
                task=self.task,
                int_dtype=self.int_dtype,
                float_dtype=self.float_dtype,
                behavior=behavior,
                preprocessors=self._preprocessors,
            )

    def get_model_for_behavior(self, model, behavior: Union[str, MiniCPMVConfigBehavior]):
        """Return the sub-module of `model` to export for `behavior`.

        Returns None for unrecognized behaviors.
        """
        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
            behavior = MiniCPMVConfigBehavior(behavior)

        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
            return model.llm

        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return model.vpm

        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
            text_embedding = model.get_input_embeddings()
            # Attach the LM config so downstream export code can read it.
            text_embedding.config = model.llm.config
            return text_embedding
        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
            # The resampler module has no config of its own; borrow the
            # vision tower's so the export machinery can introspect it.
            model.resampler.config = model.vpm.config
            return model.resampler

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Wrap `model` in the patcher matching the current behavior."""
        model_kwargs = model_kwargs or {}
        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
            return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs)

        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
            return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)

        return super().patch_model_for_export(model, model_kwargs)
0 commit comments