diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 141b055f96..74063939e8 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -134,6 +134,7 @@ "OVFluxPipeline", "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", + "OVFluxKontextPipeline", "OVFluxFillPipeline", "OVSanaPipeline", "OVPipelineForImage2Image", @@ -162,6 +163,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVFluxKontextPipeline", "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 4255b9d505..7fb4e98c35 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -92,6 +92,7 @@ OVFluxFillPipeline, OVFluxImg2ImgPipeline, OVFluxInpaintPipeline, + OVFluxKontextPipeline, OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 3f7535091e..14054f7ad7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -121,6 +121,11 @@ else: SanaSprintPipeline = object +if is_diffusers_version(">=", "0.35.0"): + from diffusers import FluxKontextPipeline +else: + FluxKontextPipeline = object + if is_diffusers_version(">=", "0.35.0"): from diffusers.models.cache_utils import CacheMixin @@ -1676,6 +1681,12 @@ class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flu auto_model_class = FluxFillPipeline +class OVFluxKontextPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxKontextPipeline): + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = FluxKontextPipeline + + class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline): main_input_name = "prompt" export_feature = "text-to-image" @@ -1779,6 +1790,11 @@ def 
_get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru SUPPORTED_OV_PIPELINES.append(OVSanaSprintPipeline) OV_TEXT2IMAGE_PIPELINES_MAPPING["sana-sprint"] = OVSanaSprintPipeline + +if is_diffusers_version(">=", "0.35.0"): + SUPPORTED_OV_PIPELINES.append(OVFluxKontextPipeline) + OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux-kontext"] = OVFluxKontextPipeline + SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, OV_IMAGE2IMAGE_PIPELINES_MAPPING, diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index ac60ce516c..9cfbd26dd7 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -139,6 +139,7 @@ "sana": "OVSanaPipeline", "flux": "OVFluxPipeline", "flux-fill": "OVFluxFillPipeline", + "flux-kontext": "OVFluxKontextPipeline", "pix2struct": "OVModelForPix2Struct", "latent-consistency": "OVLatentConsistencyModelPipeline", "open_clip_text": "OVModelOpenCLIPText", diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index ed38231e08..e1b2afe3df 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -246,6 +246,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + +class OVFluxKontextPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + class OVSanaPipeline(metaclass=DummyObject): _backends = ["openvino", "diffusers"] diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 55ff44400d..e3ad3ddb6e 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -497,6 +497,8 @@ class 
OVPipelineForImage2ImageTest(unittest.TestCase): "stable-diffusion-3", "flux", ] + if is_diffusers_version(">=", "0.35.0"): + SUPPORTED_ARCHITECTURES.append("flux-kontext") AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image TASK = "image-to-image" @@ -508,11 +510,12 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_ height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type ) - if model_type in ["flux", "stable-diffusion-3"]: + if model_type in ["flux", "stable-diffusion-3", "flux-kontext"]: inputs["height"] = height inputs["width"] = width - - inputs["strength"] = 0.75 + + if model_type != "flux-kontext": + inputs["strength"] = 0.75 return inputs @@ -544,7 +547,16 @@ def test_num_images_per_prompt(self, model_arch: str): height=height, width=width, batch_size=batch_size, model_type=model_arch ) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images - self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) + if model_arch != "flux-kontext": + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) + else: + # output shape is fixed: https://github.com/huggingface/diffusers/blob/v0.35.1/src/diffusers/pipelines/flux/pipeline_flux_kontext.py#L882 + if (height == width): + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 1024, 1024, 3)) + elif (height > width): + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 1448, 724, 3)) + else: + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 724, 1448, 3)) @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]) @require_diffusers @@ -577,8 +589,11 @@ def __call__(self, *args, **kwargs) -> None: @require_diffusers def test_shape(self, model_arch: str): pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - - height, width, batch_size 
= 128, 64, 1 + if model_arch != "flux-kontext": + # output shape is fixed: https://github.com/huggingface/diffusers/blob/v0.35.1/src/diffusers/pipelines/flux/pipeline_flux_kontext.py#L882 + height, width, batch_size = 128, 64, 1 + else: + height, width, batch_size = 1448, 724, 1 for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs( @@ -595,7 +610,7 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - if model_arch != "flux": + if not model_arch.startswith("flux"): out_channels = ( pipeline.unet.config.out_channels if pipeline.unet is not None @@ -620,9 +635,10 @@ def test_shape(self, model_arch: str): @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) - - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) + auto_cls = self.AUTOMODEL_CLASS + + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) for output_type in ["latent", "np", "pt"]: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 3cf5ead2d2..932e6e2e72 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -35,6 +35,7 @@ from optimum.intel import ( # noqa OVFluxFillPipeline, OVFluxPipeline, + OVFluxKontextPipeline, OVLatentConsistencyModelPipeline, OVLTXPipeline, OVModelForAudioClassification, @@ -68,6 +69,7 @@ is_openvino_version, is_tokenizers_version, is_transformers_version, + is_diffusers_version, ) @@ -98,6 +100,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-to-image", "stable-diffusion-3"), 
("text-to-image", "flux"), ("inpainting", "flux-fill"), + ("image-to-image", "flux-kontext"), ("text-to-image", "sana"), ("text-to-video", "ltx-video"), ("feature-extraction", "sam"), @@ -120,6 +123,7 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2, "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, + "flux-kontext": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, "ltx-video": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index deef0c4949..5daaf80445 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -79,6 +79,7 @@ "flaubert": "hf-internal-testing/tiny-random-flaubert", "flux": "katuni4ka/tiny-random-flux", "flux-fill": "katuni4ka/tiny-random-flux-fill", + "flux-kontext": "snake7gun/flux-kontext-random", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", @@ -269,6 +270,13 @@ "text_encoder": 64, "text_encoder_2": 64, }, + "flux-kontext": { + "transformer": 60, + "vae_decoder": 30, + "vae_encoder": 26, + "text_encoder": 64, + "text_encoder_2": 76, + }, "llava": { "lm_model": 30, "text_embeddings_model": 1,