From ebe5be2b13b8c43e4a0840549167fbfc1631826b Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 6 Jul 2025 19:41:46 -0700 Subject: [PATCH 01/15] add kontext support --- optimum/intel/__init__.py | 2 ++ optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 15 ++++++++++++++- .../utils/dummy_openvino_and_diffusers_objects.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index b49f17944a..cc08e3b09b 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -124,6 +124,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVFluxKontextPipeline", "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", @@ -150,6 +151,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVFluxKontextPipeline", "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index bc1266467b..f9a6f39c52 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -91,6 +91,7 @@ OVFluxImg2ImgPipeline, OVFluxInpaintPipeline, OVFluxPipeline, + OVFluxKontextPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVLTXPipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 06321a14ae..b371c89b32 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -120,6 +120,11 @@ from diffusers import SanaSprintPipeline else: SanaSprintPipeline = object + +if is_diffusers_version(">", "0.34.0"): + from diffusers import FluxKontextPipeline +else: + FluxKontextPipeline = object DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" @@ -1659,12 +1664,15 @@ class OVFluxInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, export_feature = "inpainting" auto_model_class = FluxInpaintPipeline - class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxFillPipeline): main_input_name = "image" export_feature = "inpainting" auto_model_class = FluxFillPipeline +class OVFluxKontextPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxKontextPipeline): + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = FluxKontextPipeline class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline): main_input_name = "prompt" @@ -1768,6 +1776,11 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru if is_diffusers_version(">=", "0.33.0"): SUPPORTED_OV_PIPELINES.append(OVSanaSprintPipeline) OV_TEXT2IMAGE_PIPELINES_MAPPING["sana-sprint"] = OVSanaSprintPipeline + + +if is_diffusers_version(">", "0.34.0"): + SUPPORTED_OV_PIPELINES.extend([OVFluxKontextPipeline]) + OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxKontextPipeline SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index ed38231e08..4845732922 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -246,6 +246,16 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) +class OVFluxKontextPipeline(metaclass=DummyObject): + 
_backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + class OVSanaPipeline(metaclass=DummyObject): _backends = ["openvino", "diffusers"] From d2a5a92a210d027e7377973ec3a9cb8387b5a8cc Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 6 Jul 2025 19:45:46 -0700 Subject: [PATCH 02/15] add kontext support --- optimum/intel/__init__.py | 8 +- optimum/intel/openvino/modeling_diffusion.py | 206 +++++------------- .../dummy_openvino_and_diffusers_objects.py | 3 +- 3 files changed, 53 insertions(+), 164 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index cc08e3b09b..5bbf292f3f 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -167,9 +167,7 @@ except OptionalDependencyNotAvailable: from .utils import dummy_openvino_objects - _import_structure["utils.dummy_openvino_objects"] = [ - name for name in dir(dummy_openvino_objects) if not name.startswith("_") - ] + _import_structure["utils.dummy_openvino_objects"] = [name for name in dir(dummy_openvino_objects) if not name.startswith("_")] else: _import_structure["openvino"].extend( [ @@ -206,9 +204,7 @@ except OptionalDependencyNotAvailable: from .utils import dummy_neural_compressor_objects - _import_structure["utils.dummy_neural_compressor_objects"] = [ - name for name in dir(dummy_neural_compressor_objects) if not name.startswith("_") - ] + _import_structure["utils.dummy_neural_compressor_objects"] = [name for name in dir(dummy_neural_compressor_objects) if not name.startswith("_")] else: _import_structure["neural_compressor"] = [ "INCConfig", diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index b371c89b32..907cd4b158 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -120,7 +120,7 @@ from diffusers import SanaSprintPipeline else: SanaSprintPipeline = object - + if is_diffusers_version(">", "0.34.0"): from diffusers import FluxKontextPipeline else: @@ -201,35 +201,15 @@ def __init__( ) self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None - self.transformer = ( - OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) - if transformer is not None - else None - ) + self.transformer = OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) if transformer is not None else None if unet is None and transformer is None: raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) - self.vae_encoder = ( - OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) - if vae_encoder is not None - else None - ) - self.text_encoder = ( - OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) - if text_encoder is not None - else None - ) - self.text_encoder_2 = ( - OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) - if text_encoder_2 is not None - else None - ) - self.text_encoder_3 = ( - OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) - if text_encoder_3 is not None - else None - ) + self.vae_encoder = OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) if vae_encoder is 
not None else None + self.text_encoder = OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) if text_encoder is not None else None + self.text_encoder_2 = OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) if text_encoder_2 is not None else None + self.text_encoder_3 = OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) if text_encoder_3 is not None else None # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) @@ -310,9 +290,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): The directory where to save the model files """ if self._compile_only: - raise ValueError( - "`save_pretrained()` is not supported with `compile_only` mode, please initialize model without this option" - ) + raise ValueError("`save_pretrained()` is not supported with `compile_only` mode, please initialize model without this option") save_directory = Path(save_directory) @@ -330,11 +308,7 @@ def _save_pretrained(self, save_directory: Union[str, Path]): dst_path = save_path / OV_XML_FILE_NAME dst_path.parent.mkdir(parents=True, exist_ok=True) openvino.save_model(model.model, dst_path, compress_to_fp16=False) - model_dir = ( - self.model_save_dir - if not isinstance(self.model_save_dir, TemporaryDirectory) - else self.model_save_dir.name - ) + model_dir = self.model_save_dir if not isinstance(self.model_save_dir, TemporaryDirectory) else self.model_save_dir.name config_path = Path(model_dir) / save_path.name / CONFIG_NAME if config_path.is_file(): config_save_path = save_path / CONFIG_NAME @@ -365,11 +339,7 @@ def _save_config(self, save_directory): Saves a model configuration into a directory, so that it can be re-loaded using the [`from_pretrained`] class method. """ - model_dir = ( - self.model_save_dir - if not isinstance(self.model_save_dir, TemporaryDirectory) - else self.model_save_dir.name - ) + model_dir = self.model_save_dir if not isinstance(self.model_save_dir, TemporaryDirectory) else self.model_save_dir.name save_dir = Path(save_directory) original_config = Path(model_dir) / self.config_name if original_config.exists(): @@ -527,11 +497,7 @@ def _from_pretrained( ov_config = kwargs.get("ov_config", {}) device = kwargs.get("device", "CPU") vae_ov_conifg = {**ov_config} - if ( - "GPU" in device.upper() - and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg - and is_openvino_version("<=", "2025.0") - ): + if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg and is_openvino_version("<=", "2025.0"): vae_model_path = models["vae_decoder"] required_upcast = check_scale_available(vae_model_path) if required_upcast: @@ -672,15 +638,10 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = self._device = device.upper() self.clear_requests() elif device is not None: - raise ValueError( - "The `device` argument should be a string representing the device on which the model should be loaded." - ) + raise ValueError("The `device` argument should be a string representing the device on which the model should be loaded.") if dtype is not None and dtype != self.dtype: - raise NotImplementedError( - f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " - f"Please export the model with the desired dtype." - ) + raise NotImplementedError(f"Cannot change the dtype of the model from {self.dtype} to {dtype}. 
" f"Please export the model with the desired dtype.") return self @@ -690,9 +651,7 @@ def height(self) -> int: height = model.inputs[0].get_partial_shape()[-2] if height.is_dynamic: return -1 - return height.get_length() * ( - self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio - ) + return height.get_length() * (self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio) @property def width(self) -> int: @@ -700,9 +659,7 @@ def width(self) -> int: width = model.inputs[0].get_partial_shape()[-1] if width.is_dynamic: return -1 - return width.get_length() * ( - self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio - ) + return width.get_length() * (self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio) @property def batch_size(self) -> int: @@ -798,9 +755,7 @@ def _reshape_transformer( elif inputs.get_any_name() == "hidden_states": in_channels = self.transformer.config.get("in_channels", None) if in_channels is None: - in_channels = ( - shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] - ) + in_channels = shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] if in_channels.is_dynamic: logger.warning( "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration." @@ -814,11 +769,7 @@ def _reshape_transformer( elif inputs.get_any_name() == "pooled_projections": shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] elif inputs.get_any_name() == "img_ids": - shapes[inputs] = ( - [batch_size, packed_height_width, 3] - if is_diffusers_version("<", "0.31.0") - else [packed_height_width, 3] - ) + shapes[inputs] = [batch_size, packed_height_width, 3] if is_diffusers_version("<", "0.31.0") else [packed_height_width, 3] elif inputs.get_any_name() == "txt_ids": shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3] elif inputs.get_any_name() in ["height", "width", "num_frames", "rope_interpolation_scale"]: @@ -892,9 +843,7 @@ def _reshape_vae_decoder( def reshape(self, batch_size: int, height: int, width: int, num_images_per_prompt: int = -1, num_frames: int = -1): if self._compile_only: - raise ValueError( - "`reshape()` is not supported with `compile_only` mode, please initialize model without this option" - ) + raise ValueError("`reshape()` is not supported with `compile_only` mode, please initialize model without this option") self.is_dynamic = -1 in {batch_size, height, width, num_images_per_prompt} @@ -905,15 +854,11 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp tokenizer_max_len = -1 else: tokenizer_max_len = ( - getattr(self.tokenizer, "model_max_length", -1) - if self.tokenizer is not None - else getattr(self.tokenizer_2, "model_max_length", -1) + getattr(self.tokenizer, "model_max_length", -1) if self.tokenizer is not None else getattr(self.tokenizer_2, "model_max_length", -1) ) if self.unet is not None: - self.unet.model = self._reshape_unet( - self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len - ) + self.unet.model = self._reshape_unet(self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len) if self.transformer is not None: self.transformer.model = self._reshape_transformer( self.transformer.model, @@ -924,14 
+869,10 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp tokenizer_max_len, num_frames=num_frames, ) - self.vae_decoder.model = self._reshape_vae_decoder( - self.vae_decoder.model, height, width, num_images_per_prompt, num_frames=num_frames - ) + self.vae_decoder.model = self._reshape_vae_decoder(self.vae_decoder.model, height, width, num_images_per_prompt, num_frames=num_frames) if self.vae_encoder is not None: - self.vae_encoder.model = self._reshape_vae_encoder( - self.vae_encoder.model, batch_size, height, width, num_frames=num_frames - ) + self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width, num_frames=num_frames) if self.text_encoder is not None: self.text_encoder.model = self._reshape_text_encoder( @@ -940,16 +881,13 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp batch_size, ( getattr(self.tokenizer, "model_max_length", -1) - if "Gemma" not in self.tokenizer.__class__.__name__ - and not self.__class__.__name__.startswith("OVLTX") + if "Gemma" not in self.tokenizer.__class__.__name__ and not self.__class__.__name__.startswith("OVLTX") else -1 ), ) if self.text_encoder_2 is not None: - self.text_encoder_2.model = self._reshape_text_encoder( - self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, "model_max_length", -1) - ) + self.text_encoder_2.model = self._reshape_text_encoder(self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, "model_max_length", -1)) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder(self.text_encoder_3.model, batch_size, -1) @@ -962,9 +900,7 @@ def half(self): Converts all the model weights to FP16 for more efficient inference on GPU. """ if self._compile_only: - raise ValueError( - "`half()` is not supported with `compile_only` mode, please initialize model without this option" - ) + raise ValueError("`half()` is not supported with `compile_only` mode, please initialize model without this option") for submodel in self.ov_submodels.values(): compress_model_transformation(submodel) @@ -975,9 +911,7 @@ def half(self): def clear_requests(self): if self._compile_only: - raise ValueError( - "`clear_requests()` is not supported with `compile_only` mode, please initialize model without this option" - ) + raise ValueError("`clear_requests()` is not supported with `compile_only` mode, please initialize model without this option") for submodel_name in self._ov_submodel_names: getattr(self, submodel_name).request = None @@ -1064,9 +998,7 @@ def __call__(self, *args, **kwargs): # Disable this behavior for static shape pipeline if self.auto_model_class.__name__.startswith("Sana") and shapes_overridden: sig_resolution_bining_idx = ( - list(sig.parameters).index("use_resolution_binning") - if "use_resolution_binning" in sig.parameters - else len(sig.parameters) + list(sig.parameters).index("use_resolution_binning") if "use_resolution_binning" in sig.parameters else len(sig.parameters) ) if len(args) > sig_resolution_bining_idx: args[sig_resolution_bining_idx] = False @@ -1120,11 +1052,7 @@ def dtype(self) -> torch.dtype: def _compile(self): if self.request is None: - if ( - "CACHE_DIR" not in self.ov_config.keys() - and not str(self.model_save_dir).startswith(gettempdir()) - and "GPU" in self._device - ): + if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()) and "GPU" in self._device: self.ov_config["CACHE_DIR"] = os.path.join(self.model_save_dir, 
"model_cache") logger.info(f"Compiling the {self.model_name} to {self._device} ...") @@ -1144,15 +1072,10 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = self._device = device.upper() self.request = None elif device is not None: - raise ValueError( - "The `device` argument should be a string representing the device on which the model should be loaded." - ) + raise ValueError("The `device` argument should be a string representing the device on which the model should be loaded.") if dtype is not None and dtype != self.dtype: - raise NotImplementedError( - f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " - f"Please export the model with the desired dtype." - ) + raise NotImplementedError(f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " f"Please export the model with the desired dtype.") return self @@ -1170,9 +1093,7 @@ def modules(self): class OVModelTextEncoder(OVPipelinePart): def __init__(self, model: openvino.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""): super().__init__(model, parent_pipeline, model_name) - self.hidden_states_output_names = [ - name for out in self.model.outputs for name in out.names if name.startswith("hidden_states") - ] + self.hidden_states_output_names = [name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")] self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( @@ -1196,11 +1117,7 @@ def forward( model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1]) if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) - if ( - self.hidden_states_output_names - and output_hidden_states - or getattr(self.config, "output_hidden_states", False) - ): + if self.hidden_states_output_names and output_hidden_states or getattr(self.config, "output_hidden_states", False): hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] model_outputs["hidden_states"] = hidden_states @@ -1357,9 +1274,7 @@ def forward( model_outputs["latents"] = model_outputs.pop("latent_sample") if "latent_parameters" in model_outputs: - model_outputs["latent_dist"] = DiagonalGaussianDistribution( - parameters=model_outputs.pop("latent_parameters") - ) + model_outputs["latent_dist"] = DiagonalGaussianDistribution(parameters=model_outputs.pop("latent_parameters")) if return_dict: return model_outputs @@ -1476,9 +1391,7 @@ class OVStableDiffusionPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMix auto_model_class = StableDiffusionPipeline -class OVStableDiffusionImg2ImgPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline -): +class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_img2img#diffusers.StableDiffusionImg2ImgPipeline). 
""" @@ -1488,9 +1401,7 @@ class OVStableDiffusionImg2ImgPipeline( auto_model_class = StableDiffusionImg2ImgPipeline -class OVStableDiffusionInpaintPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline -): +class OVStableDiffusionInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -1523,9 +1434,7 @@ def _get_add_time_ids( return add_time_ids -class OVStableDiffusionXLImg2ImgPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline -): +class OVStableDiffusionXLImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). """ @@ -1549,9 +1458,7 @@ def _get_add_time_ids( ): if self.config.requires_aesthetics_score: add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list( - negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) - ) + add_neg_time_ids = list(negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)) else: add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) @@ -1562,9 +1469,7 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVStableDiffusionXLInpaintPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline -): +class OVStableDiffusionXLInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ @@ -1588,9 +1493,7 @@ def _get_add_time_ids( ): if self.config.requires_aesthetics_score: add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list( - negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) - ) + add_neg_time_ids = list(negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)) else: add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) @@ -1601,9 +1504,7 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVLatentConsistencyModelPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline -): +class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" @@ -1613,9 +1514,7 @@ class OVLatentConsistencyModelPipeline( auto_model_class = LatentConsistencyModelPipeline -class OVLatentConsistencyModelImg2ImgPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline -): +class OVLatentConsistencyModelImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). """ @@ -1631,17 +1530,13 @@ class OVStableDiffusion3Pipeline(OVDiffusionPipeline, OVTextualInversionLoaderMi auto_model_class = StableDiffusion3Pipeline -class OVStableDiffusion3Img2ImgPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline -): +class OVStableDiffusion3Img2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline): main_input_name = "image" export_feature = "image-to-image" auto_model_class = StableDiffusion3Img2ImgPipeline -class OVStableDiffusion3InpaintPipeline( - OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline -): +class OVStableDiffusion3InpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline): main_input_name = "image" export_feature = "inpainting" auto_model_class = StableDiffusion3InpaintPipeline @@ -1664,16 +1559,19 @@ class OVFluxInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, export_feature = "inpainting" auto_model_class = FluxInpaintPipeline + class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxFillPipeline): main_input_name = "image" export_feature = "inpainting" auto_model_class = FluxFillPipeline + class OVFluxKontextPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxKontextPipeline): main_input_name = "image" export_feature = "image-to-image" auto_model_class = FluxKontextPipeline + class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline): main_input_name = "prompt" export_feature = "text-to-image" @@ -1706,10 +1604,7 @@ class OVLTXPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LTXPipel def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ov_pipeline_class in SUPPORTED_OV_PIPELINES: - if ( - ov_pipeline_class.__name__ == pipeline_class_name - or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name - ): + if ov_pipeline_class.__name__ == pipeline_class_name or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name: return ov_pipeline_class if throw_error_if_not_exist: @@ -1776,8 +1671,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru if is_diffusers_version(">=", "0.33.0"): SUPPORTED_OV_PIPELINES.append(OVSanaSprintPipeline) OV_TEXT2IMAGE_PIPELINES_MAPPING["sana-sprint"] = OVSanaSprintPipeline - - + + if is_diffusers_version(">", "0.34.0"): SUPPORTED_OV_PIPELINES.extend([OVFluxKontextPipeline]) OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxKontextPipeline @@ -1794,10 +1689,7 @@ def _get_task_ov_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ov_pipelines_mapping in SUPPORTED_OV_PIPELINES_MAPPINGS: for model_name, ov_pipeline_class in ov_pipelines_mapping.items(): - if ( - ov_pipeline_class.__name__ == 
pipeline_class_name - or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name - ): + if ov_pipeline_class.__name__ == pipeline_class_name or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name: return model_name model_name = _get_model_name(pipeline_class_name) diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 4845732922..e1b2afe3df 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -255,7 +255,8 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) - + + class OVSanaPipeline(metaclass=DummyObject): _backends = ["openvino", "diffusers"] From 024ac2dc675c2b17e60fdddc43d60df990b224e7 Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 6 Jul 2025 20:31:05 -0700 Subject: [PATCH 03/15] add test case --- optimum/intel/openvino/modeling_diffusion.py | 2 +- tests/openvino/utils_tests.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 907cd4b158..a6f2aafb96 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -1675,7 +1675,7 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru if is_diffusers_version(">", "0.34.0"): SUPPORTED_OV_PIPELINES.extend([OVFluxKontextPipeline]) - OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxKontextPipeline + OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux-kontext"] = OVFluxKontextPipeline SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index fae7f08269..7fcc5e6714 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -76,6 +76,7 @@ "flaubert": "hf-internal-testing/tiny-random-flaubert", "flux": "katuni4ka/tiny-random-flux", "flux-fill": "katuni4ka/tiny-random-flux-fill", + "flux-kontext": "snake7gun/flux-kontext-random", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", From c365139de486e5303279237a5c711babcc0a3d16 Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 6 Jul 2025 20:55:39 -0700 Subject: [PATCH 04/15] reformat --- optimum/intel/__init__.py | 8 +- optimum/intel/openvino/modeling_diffusion.py | 197 +++++++++++++++---- 2 files changed, 160 insertions(+), 45 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 5bbf292f3f..cc08e3b09b 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -167,7 +167,9 @@ except OptionalDependencyNotAvailable: from .utils import dummy_openvino_objects - _import_structure["utils.dummy_openvino_objects"] = [name for name in dir(dummy_openvino_objects) if not name.startswith("_")] + _import_structure["utils.dummy_openvino_objects"] = [ + name for name in dir(dummy_openvino_objects) if not name.startswith("_") + ] else: _import_structure["openvino"].extend( [ @@ -204,7 +206,9 @@ except OptionalDependencyNotAvailable: from .utils import dummy_neural_compressor_objects - _import_structure["utils.dummy_neural_compressor_objects"] = [name for name in dir(dummy_neural_compressor_objects) if not name.startswith("_")] + 
_import_structure["utils.dummy_neural_compressor_objects"] = [ + name for name in dir(dummy_neural_compressor_objects) if not name.startswith("_") + ] else: _import_structure["neural_compressor"] = [ "INCConfig", diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index a6f2aafb96..04a51afd65 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -201,15 +201,35 @@ def __init__( ) self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None - self.transformer = OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) if transformer is not None else None + self.transformer = ( + OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) + if transformer is not None + else None + ) if unet is None and transformer is None: raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) - self.vae_encoder = OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) if vae_encoder is not None else None - self.text_encoder = OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) if text_encoder is not None else None - self.text_encoder_2 = OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) if text_encoder_2 is not None else None - self.text_encoder_3 = OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) if text_encoder_3 is not None else None + self.vae_encoder = ( + OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) + if vae_encoder is not None + else None + ) + self.text_encoder = ( + OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) + if text_encoder is not None + else None + ) + self.text_encoder_2 = ( + OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) + if text_encoder_2 is not None + else None + ) + self.text_encoder_3 = ( + OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) + if text_encoder_3 is not None + else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) @@ -290,7 +310,9 @@ def _save_pretrained(self, save_directory: Union[str, Path]): The directory where to save the model files """ if self._compile_only: - raise ValueError("`save_pretrained()` is not supported with `compile_only` mode, please initialize model without this option") + raise ValueError( + "`save_pretrained()` is not supported with `compile_only` mode, please initialize model without this option" + ) save_directory = Path(save_directory) @@ -308,7 +330,11 @@ def _save_pretrained(self, save_directory: Union[str, Path]): dst_path = save_path / OV_XML_FILE_NAME dst_path.parent.mkdir(parents=True, exist_ok=True) openvino.save_model(model.model, dst_path, compress_to_fp16=False) - model_dir = self.model_save_dir if not isinstance(self.model_save_dir, TemporaryDirectory) else self.model_save_dir.name + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) config_path = Path(model_dir) / save_path.name / CONFIG_NAME if config_path.is_file(): config_save_path = save_path / CONFIG_NAME @@ -339,7 +365,11 @@ def 
_save_config(self, save_directory): Saves a model configuration into a directory, so that it can be re-loaded using the [`from_pretrained`] class method. """ - model_dir = self.model_save_dir if not isinstance(self.model_save_dir, TemporaryDirectory) else self.model_save_dir.name + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) save_dir = Path(save_directory) original_config = Path(model_dir) / self.config_name if original_config.exists(): @@ -497,7 +527,11 @@ def _from_pretrained( ov_config = kwargs.get("ov_config", {}) device = kwargs.get("device", "CPU") vae_ov_conifg = {**ov_config} - if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg and is_openvino_version("<=", "2025.0"): + if ( + "GPU" in device.upper() + and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg + and is_openvino_version("<=", "2025.0") + ): vae_model_path = models["vae_decoder"] required_upcast = check_scale_available(vae_model_path) if required_upcast: @@ -638,10 +672,15 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = self._device = device.upper() self.clear_requests() elif device is not None: - raise ValueError("The `device` argument should be a string representing the device on which the model should be loaded.") + raise ValueError( + "The `device` argument should be a string representing the device on which the model should be loaded." + ) if dtype is not None and dtype != self.dtype: - raise NotImplementedError(f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " f"Please export the model with the desired dtype.") + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) return self @@ -651,7 +690,9 @@ def height(self) -> int: height = model.inputs[0].get_partial_shape()[-2] if height.is_dynamic: return -1 - return height.get_length() * (self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio) + return height.get_length() * ( + self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio + ) @property def width(self) -> int: @@ -659,7 +700,9 @@ def width(self) -> int: width = model.inputs[0].get_partial_shape()[-1] if width.is_dynamic: return -1 - return width.get_length() * (self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio) + return width.get_length() * ( + self.vae_scale_factor if hasattr(self, "vae_scale_factor") else self.vae_spatial_compression_ratio + ) @property def batch_size(self) -> int: @@ -755,7 +798,9 @@ def _reshape_transformer( elif inputs.get_any_name() == "hidden_states": in_channels = self.transformer.config.get("in_channels", None) if in_channels is None: - in_channels = shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] + in_channels = ( + shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2] + ) if in_channels.is_dynamic: logger.warning( "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration." 
@@ -769,7 +814,11 @@ def _reshape_transformer( elif inputs.get_any_name() == "pooled_projections": shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] elif inputs.get_any_name() == "img_ids": - shapes[inputs] = [batch_size, packed_height_width, 3] if is_diffusers_version("<", "0.31.0") else [packed_height_width, 3] + shapes[inputs] = ( + [batch_size, packed_height_width, 3] + if is_diffusers_version("<", "0.31.0") + else [packed_height_width, 3] + ) elif inputs.get_any_name() == "txt_ids": shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3] elif inputs.get_any_name() in ["height", "width", "num_frames", "rope_interpolation_scale"]: @@ -843,7 +892,9 @@ def _reshape_vae_decoder( def reshape(self, batch_size: int, height: int, width: int, num_images_per_prompt: int = -1, num_frames: int = -1): if self._compile_only: - raise ValueError("`reshape()` is not supported with `compile_only` mode, please initialize model without this option") + raise ValueError( + "`reshape()` is not supported with `compile_only` mode, please initialize model without this option" + ) self.is_dynamic = -1 in {batch_size, height, width, num_images_per_prompt} @@ -854,11 +905,15 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp tokenizer_max_len = -1 else: tokenizer_max_len = ( - getattr(self.tokenizer, "model_max_length", -1) if self.tokenizer is not None else getattr(self.tokenizer_2, "model_max_length", -1) + getattr(self.tokenizer, "model_max_length", -1) + if self.tokenizer is not None + else getattr(self.tokenizer_2, "model_max_length", -1) ) if self.unet is not None: - self.unet.model = self._reshape_unet(self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len) + self.unet.model = self._reshape_unet( + self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + ) if self.transformer is not None: self.transformer.model = self._reshape_transformer( self.transformer.model, @@ -869,10 +924,14 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp tokenizer_max_len, num_frames=num_frames, ) - self.vae_decoder.model = self._reshape_vae_decoder(self.vae_decoder.model, height, width, num_images_per_prompt, num_frames=num_frames) + self.vae_decoder.model = self._reshape_vae_decoder( + self.vae_decoder.model, height, width, num_images_per_prompt, num_frames=num_frames + ) if self.vae_encoder is not None: - self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width, num_frames=num_frames) + self.vae_encoder.model = self._reshape_vae_encoder( + self.vae_encoder.model, batch_size, height, width, num_frames=num_frames + ) if self.text_encoder is not None: self.text_encoder.model = self._reshape_text_encoder( @@ -881,13 +940,16 @@ def reshape(self, batch_size: int, height: int, width: int, num_images_per_promp batch_size, ( getattr(self.tokenizer, "model_max_length", -1) - if "Gemma" not in self.tokenizer.__class__.__name__ and not self.__class__.__name__.startswith("OVLTX") + if "Gemma" not in self.tokenizer.__class__.__name__ + and not self.__class__.__name__.startswith("OVLTX") else -1 ), ) if self.text_encoder_2 is not None: - self.text_encoder_2.model = self._reshape_text_encoder(self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, "model_max_length", -1)) + self.text_encoder_2.model = self._reshape_text_encoder( + self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, 
"model_max_length", -1) + ) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder(self.text_encoder_3.model, batch_size, -1) @@ -900,7 +962,9 @@ def half(self): Converts all the model weights to FP16 for more efficient inference on GPU. """ if self._compile_only: - raise ValueError("`half()` is not supported with `compile_only` mode, please initialize model without this option") + raise ValueError( + "`half()` is not supported with `compile_only` mode, please initialize model without this option" + ) for submodel in self.ov_submodels.values(): compress_model_transformation(submodel) @@ -911,7 +975,9 @@ def half(self): def clear_requests(self): if self._compile_only: - raise ValueError("`clear_requests()` is not supported with `compile_only` mode, please initialize model without this option") + raise ValueError( + "`clear_requests()` is not supported with `compile_only` mode, please initialize model without this option" + ) for submodel_name in self._ov_submodel_names: getattr(self, submodel_name).request = None @@ -998,7 +1064,9 @@ def __call__(self, *args, **kwargs): # Disable this behavior for static shape pipeline if self.auto_model_class.__name__.startswith("Sana") and shapes_overridden: sig_resolution_bining_idx = ( - list(sig.parameters).index("use_resolution_binning") if "use_resolution_binning" in sig.parameters else len(sig.parameters) + list(sig.parameters).index("use_resolution_binning") + if "use_resolution_binning" in sig.parameters + else len(sig.parameters) ) if len(args) > sig_resolution_bining_idx: args[sig_resolution_bining_idx] = False @@ -1052,7 +1120,11 @@ def dtype(self) -> torch.dtype: def _compile(self): if self.request is None: - if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()) and "GPU" in self._device: + if ( + "CACHE_DIR" not in self.ov_config.keys() + and not str(self.model_save_dir).startswith(gettempdir()) + and "GPU" in self._device + ): self.ov_config["CACHE_DIR"] = os.path.join(self.model_save_dir, "model_cache") logger.info(f"Compiling the {self.model_name} to {self._device} ...") @@ -1072,10 +1144,15 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = self._device = device.upper() self.request = None elif device is not None: - raise ValueError("The `device` argument should be a string representing the device on which the model should be loaded.") + raise ValueError( + "The `device` argument should be a string representing the device on which the model should be loaded." + ) if dtype is not None and dtype != self.dtype: - raise NotImplementedError(f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " f"Please export the model with the desired dtype.") + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." 
+ ) return self @@ -1093,7 +1170,9 @@ def modules(self): class OVModelTextEncoder(OVPipelinePart): def __init__(self, model: openvino.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""): super().__init__(model, parent_pipeline, model_name) - self.hidden_states_output_names = [name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")] + self.hidden_states_output_names = [ + name for out in self.model.outputs for name in out.names if name.startswith("hidden_states") + ] self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( @@ -1117,7 +1196,11 @@ def forward( model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1]) if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) - if self.hidden_states_output_names and output_hidden_states or getattr(self.config, "output_hidden_states", False): + if ( + self.hidden_states_output_names + and output_hidden_states + or getattr(self.config, "output_hidden_states", False) + ): hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] model_outputs["hidden_states"] = hidden_states @@ -1274,7 +1357,9 @@ def forward( model_outputs["latents"] = model_outputs.pop("latent_sample") if "latent_parameters" in model_outputs: - model_outputs["latent_dist"] = DiagonalGaussianDistribution(parameters=model_outputs.pop("latent_parameters")) + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) if return_dict: return model_outputs @@ -1391,7 +1476,9 @@ class OVStableDiffusionPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMix auto_model_class = StableDiffusionPipeline -class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline): +class OVStableDiffusionImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_img2img#diffusers.StableDiffusionImg2ImgPipeline). """ @@ -1401,7 +1488,9 @@ class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLo auto_model_class = StableDiffusionImg2ImgPipeline -class OVStableDiffusionInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline): +class OVStableDiffusionInpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_inpaint#diffusers.StableDiffusionInpaintPipeline). 
""" @@ -1434,7 +1523,9 @@ def _get_add_time_ids( return add_time_ids -class OVStableDiffusionXLImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline): +class OVStableDiffusionXLImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). """ @@ -1458,7 +1549,9 @@ def _get_add_time_ids( ): if self.config.requires_aesthetics_score: add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list(negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) @@ -1469,7 +1562,9 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVStableDiffusionXLInpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline): +class OVStableDiffusionXLInpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ @@ -1493,7 +1588,9 @@ def _get_add_time_ids( ): if self.config.requires_aesthetics_score: add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) - add_neg_time_ids = list(negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: add_time_ids = list(original_size + crops_coords_top_left + target_size) add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) @@ -1504,7 +1601,9 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline): +class OVLatentConsistencyModelPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" @@ -1514,7 +1613,9 @@ class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, OVTextualInversionLo auto_model_class = LatentConsistencyModelPipeline -class OVLatentConsistencyModelImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline): +class OVLatentConsistencyModelImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). """ @@ -1530,13 +1631,17 @@ class OVStableDiffusion3Pipeline(OVDiffusionPipeline, OVTextualInversionLoaderMi auto_model_class = StableDiffusion3Pipeline -class OVStableDiffusion3Img2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline): +class OVStableDiffusion3Img2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline +): main_input_name = "image" export_feature = "image-to-image" auto_model_class = StableDiffusion3Img2ImgPipeline -class OVStableDiffusion3InpaintPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline): +class OVStableDiffusion3InpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline +): main_input_name = "image" export_feature = "inpainting" auto_model_class = StableDiffusion3InpaintPipeline @@ -1604,7 +1709,10 @@ class OVLTXPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, LTXPipel def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ov_pipeline_class in SUPPORTED_OV_PIPELINES: - if ov_pipeline_class.__name__ == pipeline_class_name or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name: + if ( + ov_pipeline_class.__name__ == pipeline_class_name + or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): return ov_pipeline_class if throw_error_if_not_exist: @@ -1689,7 +1797,10 @@ def _get_task_ov_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ov_pipelines_mapping in SUPPORTED_OV_PIPELINES_MAPPINGS: for model_name, ov_pipeline_class in ov_pipelines_mapping.items(): - if ov_pipeline_class.__name__ == pipeline_class_name or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name: + if ( + ov_pipeline_class.__name__ == pipeline_class_name + or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): return model_name model_name = _get_model_name(pipeline_class_name) From 051a8da93fb09cf79e85ba5f4e98aeacdb521c10 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 7 Jul 2025 08:43:12 +0000 Subject: [PATCH 05/15] Apply style fixes --- optimum/intel/openvino/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index f9a6f39c52..5565905503 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -90,8 +90,8 @@ OVFluxFillPipeline, OVFluxImg2ImgPipeline, OVFluxInpaintPipeline, - OVFluxPipeline, OVFluxKontextPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVLTXPipeline, From c49252f08d517b514041b4d8a9724f691cec18f2 Mon Sep 17 00:00:00 2001 From: ethan Date: Mon, 7 Jul 2025 19:39:04 -0700 Subject: 
[PATCH 06/15] add diffusion and export test --- tests/openvino/test_diffusion.py | 5 +++-- tests/openvino/test_exporters_cli.py | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 5b1d3af656..8859c8f7e4 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -483,6 +483,7 @@ class OVPipelineForImage2ImageTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") SUPPORTED_ARCHITECTURES.append("flux") + SUPPORTED_ARCHITECTURES.append("flux-kontext") AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image @@ -496,7 +497,7 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_ height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type ) - if model_type in ["flux", "stable-diffusion-3"]: + if model_type in ["flux", "stable-diffusion-3", "flux-kontext"]: inputs["height"] = height inputs["width"] = width @@ -583,7 +584,7 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - if model_arch != "flux": + if model_arch != "flux" and model_arch != "flux-kontext": out_channels = ( pipeline.unet.config.out_channels if pipeline.unet is not None diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 83b5b1e80d..6b00e1eedd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -31,6 +31,7 @@ from optimum.exporters.openvino.utils import COMPLEX_CHAT_TEMPLATES from optimum.intel import ( # noqa OVFluxFillPipeline, + OVFluxKontextPipeline, OVFluxPipeline, OVLatentConsistencyModelPipeline, OVLTXPipeline, @@ -100,6 +101,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("inpainting", "flux-fill"), ("text-to-image", "sana"), ("text-to-video", "ltx-video"), + ("image-to-image", "flux-kontext"), ] ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { @@ -117,6 +119,7 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2, "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, + "flux-kontext": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, "ltx-video": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, From 8459565812fa69be50982d0931f5db4ad89d3e40 Mon Sep 17 00:00:00 2001 From: ethan Date: Tue, 8 Jul 2025 18:43:11 -0700 Subject: [PATCH 07/15] rebase the export cli test --- tests/openvino/test_diffusion.py | 4 +++- tests/openvino/test_exporters_cli.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 8859c8f7e4..63fc6b4803 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -24,6 +24,7 @@ AutoPipelineForInpainting, AutoPipelineForText2Image, DiffusionPipeline, + FluxKontextPipeline, ) from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker 
 from diffusers.utils import load_image
@@ -611,7 +612,8 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
         height, width, batch_size = 128, 128, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch)
 
-        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_cls = self.AUTOMODEL_CLASS if "flux-kontext" not in model_arch else FluxKontextPipeline
+        diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch])
         ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
         for output_type in ["latent", "np", "pt"]:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 6b00e1eedd..9e339f1d4f 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -101,7 +101,6 @@ class OVCLIExportTestCase(unittest.TestCase):
             ("inpainting", "flux-fill"),
             ("text-to-image", "sana"),
             ("text-to-video", "ltx-video"),
-            ("image-to-image", "flux-kontext"),
         ]
     )
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
@@ -119,7 +118,6 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2,
         "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
-        "flux-kontext": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,
         "ltx-video": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,

From 2c7e1dee1f2ed916ee163faac98d46818bcb417f Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 8 Jul 2025 22:48:16 -0700
Subject: [PATCH 08/15] add quantize test case

---
 tests/openvino/test_quantization.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 26c6283b85..f2933dcada 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -42,6 +42,7 @@
 from optimum.intel import (
     OVConfig,
     OVFluxPipeline,
+    OVFluxKontextPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -1039,6 +1040,7 @@ class OVWeightCompressionTest(unittest.TestCase):
             (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
             (OVFluxPipeline, "flux", 7, 56),
             (OVSanaPipeline, "sana", 19, 53),
+            (OVFluxKontextPipeline, "flux-kontext", 19, 53),
         ]
     )

From c160e7fbb03d83112fb738721e90ec502e6c6d3a Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 8 Jul 2025 23:26:15 -0700
Subject: [PATCH 09/15] delete cli export test

---
 tests/openvino/test_exporters_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 9e339f1d4f..83b5b1e80d 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -31,7 +31,6 @@
 from optimum.exporters.openvino.utils import COMPLEX_CHAT_TEMPLATES
 from optimum.intel import (  # noqa
     OVFluxFillPipeline,
-    OVFluxKontextPipeline,
     OVFluxPipeline,
     OVLatentConsistencyModelPipeline,
     OVLTXPipeline,
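The churn in patches 06-09 comes down to one mechanism: everything Kontext-related must be gated on the installed diffusers version. For reference, the guarded-import convention the series relies on (already used for Sana and LTX, with the bound that patch 10 below settles on) looks roughly like this; a minimal restatement outside the diff context, assuming the version helper lives in optimum.intel.utils.import_utils:

# Optional-pipeline convention used in modeling_diffusion.py: fall back to
# `object` so the OV subclass definition still evaluates on older diffusers,
# and gate any registration on the same version check.
from optimum.intel.utils.import_utils import is_diffusers_version

if is_diffusers_version(">=", "0.35.0"):
    from diffusers import FluxKontextPipeline
else:
    # Keeps `class OVFluxKontextPipeline(..., FluxKontextPipeline)` importable
    # even when the real pipeline class is unavailable.
    FluxKontextPipeline = object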
From 1e37d3d9a2068ba82fe97b49dfe7adb9cc0b58db Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 19 Aug 2025 21:13:34 -0700
Subject: [PATCH 10/15] support diffusers 0.35

---
 optimum/intel/openvino/modeling_diffusion.py | 2 +-
 tests/openvino/test_exporters_cli.py         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 72752b2c65..2839e7fb7a 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -121,7 +121,7 @@
 else:
     SanaSprintPipeline = object
 
-if is_diffusers_version(">", "0.34.0"):
+if is_diffusers_version(">=", "0.35.0"):
     from diffusers import FluxKontextPipeline
 else:
     FluxKontextPipeline = object
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index eb51b99568..41064af092 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -33,6 +33,7 @@
 from optimum.intel import (  # noqa
     OVFluxFillPipeline,
     OVFluxPipeline,
+    OVFluxKontextPipeline,
     OVLatentConsistencyModelPipeline,
     OVLTXPipeline,
     OVModelForAudioClassification,
@@ -110,6 +111,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-to-image", "stable-diffusion-3"),
         ("text-to-image", "flux"),
         ("inpainting", "flux-fill"),
+        ("image-to-image", "flux-kontext"),
         ("text-to-image", "sana"),
         ("text-to-video", "ltx-video"),
     ]
 )
@@ -137,6 +139,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2,
     "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+    "flux-kontext": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,
     "ltx-video": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,

From cb7df16372d34c661855ef4567d45aeaec19438e Mon Sep 17 00:00:00 2001
From: ethan
Date: Tue, 9 Sep 2025 21:05:01 -0700
Subject: [PATCH 11/15] update diffusion test with static shape

---
 optimum/intel/openvino/modeling_diffusion.py |  4 +--
 tests/openvino/test_diffusion.py             | 35 ++++++++++++++------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 2839e7fb7a..d558184693 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -1791,8 +1791,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru
     OV_TEXT2IMAGE_PIPELINES_MAPPING["sana-sprint"] = OVSanaSprintPipeline
 
 
-if is_diffusers_version(">", "0.34.0"):
-    SUPPORTED_OV_PIPELINES.extend([OVFluxKontextPipeline])
+if is_diffusers_version(">=", "0.35.0"):
+    SUPPORTED_OV_PIPELINES.append(OVFluxKontextPipeline)
     OV_IMAGE2IMAGE_PIPELINES_MAPPING["flux-kontext"] = OVFluxKontextPipeline
 
 SUPPORTED_OV_PIPELINES_MAPPINGS = [
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index d100cf8944..cbdb79a519 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -24,7 +24,6 @@
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
     DiffusionPipeline,
-    FluxKontextPipeline,
 )
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from diffusers.utils import load_image
@@ -485,7 +484,8 @@ class OVPipelineForImage2ImageTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
         SUPPORTED_ARCHITECTURES.append("flux")
-        SUPPORTED_ARCHITECTURES.append("flux-kontext")
+        if is_diffusers_version(">=", "0.35.0"):
+            SUPPORTED_ARCHITECTURES.append("flux-kontext")
 
     AUTOMODEL_CLASS = AutoPipelineForImage2Image
     OVMODEL_CLASS = OVPipelineForImage2Image
@@ -502,8 +502,9 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_
         if model_type in ["flux", "stable-diffusion-3", "flux-kontext"]:
             inputs["height"] = height
             inputs["width"] = width
-
-        inputs["strength"] = 0.75
+
+        if model_type != "flux-kontext":
+            inputs["strength"] = 0.75
 
         return inputs
 
@@ -535,7 +536,16 @@ def test_num_images_per_prompt(self, model_arch: str):
                     height=height, width=width, batch_size=batch_size, model_type=model_arch
                 )
                 outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images
-                self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
+                if model_arch != "flux-kontext":
+                    self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
+                else:
+                    # output shape is fixed: https://github.com/huggingface/diffusers/blob/v0.35.1/src/diffusers/pipelines/flux/pipeline_flux_kontext.py#L882
+                    if height == width:
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 1024, 1024, 3))
+                    elif height > width:
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 1448, 724, 3))
+                    else:
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, 724, 1448, 3))
 
     @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"])
     @require_diffusers
@@ -568,8 +578,11 @@ def __call__(self, *args, **kwargs) -> None:
     @require_diffusers
     def test_shape(self, model_arch: str):
         pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-
-        height, width, batch_size = 128, 64, 1
+        if model_arch != "flux-kontext":
+            height, width, batch_size = 128, 64, 1
+        else:
+            # output shape is fixed: https://github.com/huggingface/diffusers/blob/v0.35.1/src/diffusers/pipelines/flux/pipeline_flux_kontext.py#L882
+            height, width, batch_size = 1448, 724, 1
 
         for input_type in ["pil", "np", "pt"]:
             inputs = self.generate_inputs(
@@ -586,7 +599,7 @@ def test_shape(self, model_arch: str):
             elif output_type == "pt":
                 self.assertEqual(outputs.shape, (batch_size, 3, height, width))
             else:
-                if model_arch != "flux" and model_arch != "flux-kontext":
+                if not model_arch.startswith("flux"):
                     out_channels = (
                         pipeline.unet.config.out_channels
                         if pipeline.unet is not None
@@ -611,9 +624,9 @@ def test_shape(self, model_arch: str):
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
         height, width, batch_size = 128, 128, 1
-        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch)
-
-        auto_cls = self.AUTOMODEL_CLASS if "flux-kontext" not in model_arch else FluxKontextPipeline
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch)
+        auto_cls = self.AUTOMODEL_CLASS
+
         diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch])
         ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
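Taken together, patch 11 encodes the two ways Kontext differs from the other image-to-image pipelines under test: it accepts no `strength` argument, and it pins the output resolution itself rather than honoring the requested `height`/`width`. A minimal usage sketch of the exported pipeline; the model directory, input image, and prompt below are placeholders, not values from this series:

# Hypothetical usage sketch for OVFluxKontextPipeline; assumes diffusers >= 0.35.0
# and an already-exported OpenVINO model at the placeholder path.
from diffusers.utils import load_image

from optimum.intel import OVFluxKontextPipeline

pipe = OVFluxKontextPipeline.from_pretrained("path/to/flux-kontext-ov")
ref = load_image("input.png")  # reference image to edit

# No `strength` kwarg: Kontext conditions on the reference image directly, and
# the pipeline snaps the output to its preferred resolution (1024x1024 for a
# square input), which is what test_num_images_per_prompt asserts above.
result = pipe(prompt="turn the sky a warm sunset orange", image=ref)
result.images[0].save("edited.png")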
From 8b8e2c821c8533e25577ccf457ba31a038796bc9 Mon Sep 17 00:00:00 2001
From: ethan
Date: Wed, 10 Sep 2025 09:06:45 -0700
Subject: [PATCH 12/15] remove quantization test

---
 tests/openvino/test_exporters_cli.py | 8 +++++++-
 tests/openvino/test_quantization.py  | 2 --
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 41064af092..4abbfe5eb0 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -67,6 +67,7 @@
     is_openvino_version,
     is_tokenizers_version,
     is_transformers_version,
+    is_diffusers_version,
 )
 
 
@@ -111,11 +112,16 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-to-image", "stable-diffusion-3"),
         ("text-to-image", "flux"),
         ("inpainting", "flux-fill"),
-        ("image-to-image", "flux-kontext"),
         ("text-to-image", "sana"),
         ("text-to-video", "ltx-video"),
     ]
 )
+if is_diffusers_version(">=", "0.35.0"):
+    SUPPORTED_ARCHITECTURES.extend(
+        [
+            ("image-to-image", "flux-kontext"),
+        ]
+    )
 
 if is_transformers_version(">=", "4.54"):
     SUPPORTED_ARCHITECTURES.extend(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index c5bb46c34a..c9497e804e 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -42,7 +42,6 @@
 from optimum.intel import (
     OVConfig,
     OVFluxPipeline,
-    OVFluxKontextPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -1134,7 +1133,6 @@ class OVWeightCompressionTest(unittest.TestCase):
         (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
         (OVFluxPipeline, "flux", 7, 56),
         (OVSanaPipeline, "sana", 19, 53),
-        (OVFluxKontextPipeline, "flux-kontext", 19, 53),
     ]
 )

From dd534af28d160e3e9723b77d9b95734cb2d354cb Mon Sep 17 00:00:00 2001
From: ethan
Date: Wed, 17 Sep 2025 19:55:20 -0700
Subject: [PATCH 13/15] add flux-kontext to _HEAD_TO_AUTOMODELS

---
 optimum/intel/openvino/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index ac60ce516c..9cfbd26dd7 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -139,6 +139,7 @@
     "sana": "OVSanaPipeline",
     "flux": "OVFluxPipeline",
     "flux-fill": "OVFluxFillPipeline",
+    "flux-kontext": "OVFluxKontextPipeline",
     "pix2struct": "OVModelForPix2Struct",
     "latent-consistency": "OVLatentConsistencyModelPipeline",
     "open_clip_text": "OVModelOpenCLIPText",

From bcd50709a067d25e2a05e7b7bfd43534fd4325f6 Mon Sep 17 00:00:00 2001
From: ethan
Date: Thu, 18 Sep 2025 08:33:39 -0700
Subject: [PATCH 14/15] update for kontext

---
 tests/openvino/utils_tests.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 8bf052910e..dfa99eb6d6 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -267,6 +267,13 @@
         "text_encoder": 64,
         "text_encoder_2": 64,
     },
+    "flux-kontext": {
+        "transformer": 56,
+        "vae_decoder": 28,
+        "vae_encoder": 24,
+        "text_encoder": 64,
+        "text_encoder_2": 64,
+    },
     "llava": {
         "lm_model": 30,
         "text_embeddings_model": 1,

From 4ae8de2ca6fdd39bbcda57264b77db3f5fe41338 Mon Sep 17 00:00:00 2001
From: ethan
Date: Sat, 27 Sep 2025 08:00:38 -0700
Subject: [PATCH 15/15] update int8 test case

---
 tests/openvino/utils_tests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index dfa99eb6d6..1a1ad79552 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -268,11 +268,11 @@
         "text_encoder_2": 64,
     },
     "flux-kontext": {
"transformer": 56, - "vae_decoder": 28, - "vae_encoder": 24, + "transformer": 60, + "vae_decoder": 30, + "vae_encoder": 26, "text_encoder": 64, - "text_encoder_2": 64, + "text_encoder_2": 76, }, "llava": { "lm_model": 30,