|  | 
| 41 | 41 |     PhiOnnxConfig, | 
| 42 | 42 |     T5OnnxConfig, | 
| 43 | 43 |     UNetOnnxConfig, | 
|  | 44 | +    VaeEncoderOnnxConfig, | 
| 44 | 45 |     VisionOnnxConfig, | 
| 45 | 46 |     WhisperOnnxConfig, | 
| 46 | 47 | ) | 
|  | 
| 106 | 107 |     Qwen2VLVisionEmbMergerPatcher, | 
| 107 | 108 |     QwenModelPatcher, | 
| 108 | 109 |     RotaryEmbPatcher, | 
|  | 110 | +    SanaTextEncoderModelPatcher, | 
| 109 | 111 |     StatefulSeq2SeqDecoderPatcher, | 
| 110 | 112 |     UpdateCausalMaskModelPatcher, | 
| 111 | 113 |     XverseModelPatcher, | 
| @@ -134,6 +136,8 @@ def init_model_configs(): | 
| 134 | 136 |     if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: | 
| 135 | 137 |         TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" | 
| 136 | 138 |         TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} | 
|  | 139 | +        TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") | 
|  | 140 | +        TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" | 
| 137 | 141 | 
 | 
| 138 | 142 |     supported_model_types = [ | 
| 139 | 143 |         "_SUPPORTED_MODEL_TYPE", | 
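For context: the two new `text-to-image` entries let `TasksManager` resolve Sana checkpoints to `SanaPipeline` instead of the generic auto loader. A minimal sketch of how the mapping is consumed once `init_model_configs()` has run (reading the private tables purely for illustration):

```python
from optimum.exporters.tasks import TasksManager

# After init_model_configs() has patched the tables, the "sana" model type
# maps to SanaPipeline for the "text-to-image" task.
mapping = TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]
print(mapping.get("sana"))  # -> "SanaPipeline"
```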
| @@ -1896,6 +1900,83 @@ class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): | 
| 1896 | 1900 |     pass | 
| 1897 | 1901 | 
 | 
| 1898 | 1902 | 
 | 
|  | 1903 | +@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") | 
|  | 1904 | +class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): | 
|  | 1905 | +    @property | 
|  | 1906 | +    def inputs(self) -> Dict[str, Dict[int, str]]: | 
|  | 1907 | +        return { | 
|  | 1908 | +            "input_ids": {0: "batch_size", 1: "sequence_length"}, | 
|  | 1909 | +            "attention_mask": {0: "batch_size", 1: "sequence_length"}, | 
|  | 1910 | +        } | 
|  | 1911 | + | 
|  | 1912 | +    def patch_model_for_export( | 
|  | 1913 | +        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None | 
|  | 1914 | +    ) -> ModelPatcher: | 
|  | 1915 | +        return SanaTextEncoderModelPatcher(self, model, model_kwargs) | 
|  | 1916 | + | 
|  | 1917 | + | 
|  | 1918 | +class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): | 
|  | 1919 | +    SUPPORTED_INPUT_NAMES = ( | 
|  | 1920 | +        "decoder_input_ids", | 
|  | 1921 | +        "decoder_attention_mask", | 
|  | 1922 | +        "encoder_outputs", | 
|  | 1923 | +        "encoder_hidden_states", | 
|  | 1924 | +        "encoder_attention_mask", | 
|  | 1925 | +    ) | 
|  | 1926 | + | 
|  | 1927 | + | 
|  | 1928 | +class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator): | 
|  | 1929 | +    def __init__( | 
|  | 1930 | +        self, | 
|  | 1931 | +        task: str, | 
|  | 1932 | +        normalized_config: NormalizedVisionConfig, | 
|  | 1933 | +        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], | 
|  | 1934 | +        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], | 
|  | 1935 | +        width: int = DEFAULT_DUMMY_SHAPES["width"] // 8, | 
|  | 1936 | +        height: int = DEFAULT_DUMMY_SHAPES["height"] // 8, | 
|  | 1937 | +        # The width/height defaults above are reduced 8x for Sana to lower memory usage during conversion | 
|  | 1938 | +        **kwargs, | 
|  | 1939 | +    ): | 
|  | 1940 | +        super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs) | 
|  | 1941 | + | 
|  | 1942 | + | 
|  | 1943 | +@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") | 
|  | 1944 | +class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): | 
|  | 1945 | +    NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( | 
|  | 1946 | +        image_size="sample_size", | 
|  | 1947 | +        num_channels="in_channels", | 
|  | 1948 | +        hidden_size="caption_channels", | 
|  | 1949 | +        vocab_size="attention_head_dim", | 
|  | 1950 | +        allow_new=True, | 
|  | 1951 | +    ) | 
|  | 1952 | +    DUMMY_INPUT_GENERATOR_CLASSES = ( | 
|  | 1953 | +        DummySanaTransformerVisionInputGenerator, | 
|  | 1954 | +        DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator, | 
|  | 1955 | +    ) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] | 
|  | 1956 | + | 
|  | 1957 | +    @property | 
|  | 1958 | +    def inputs(self): | 
|  | 1959 | +        common_inputs = super().inputs | 
|  | 1960 | +        common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} | 
|  | 1961 | +        return common_inputs | 
|  | 1962 | + | 
|  | 1963 | +    def rename_ambiguous_inputs(self, inputs): | 
|  | 1964 | +        # The input name in the model's forward signature is `hidden_states` rather than `sample`, hence the export input name is updated. | 
|  | 1965 | +        hidden_states = inputs.pop("sample", None) | 
|  | 1966 | +        if hidden_states is not None: | 
|  | 1967 | +            inputs["hidden_states"] = hidden_states | 
|  | 1968 | +        return inputs | 
|  | 1969 | + | 
|  | 1970 | + | 
|  | 1971 | +@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers") | 
|  | 1972 | +class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): | 
|  | 1973 | +    @property | 
|  | 1974 | +    def outputs(self) -> Dict[str, Dict[int, str]]: | 
|  | 1975 | +        return { | 
|  | 1976 | +            "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, | 
|  | 1977 | +        } | 
|  | 1978 | + | 
|  | 1979 | + | 
| 1899 | 1980 | class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): | 
| 1900 | 1981 |     SUPPORTED_INPUT_NAMES = ( | 
| 1901 | 1982 |         "pixel_values", | 
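Taken together, the configs above cover the Sana sub-models (Gemma2 text encoder, Sana transformer, DC-AE encoder). A sketch of the export flow they enable; the checkpoint id is an assumption, substitute any Sana diffusers checkpoint:

```python
from optimum.intel import OVPipelineForText2Image

pipe = OVPipelineForText2Image.from_pretrained(
    "Efficient-Large-Model/Sana_600M_1024px_diffusers",  # hypothetical example id
    export=True,  # convert each sub-model to OpenVINO on the fly
)
image = pipe("a watercolor fox in a snowy forest", num_inference_steps=20).images[0]
```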