diff --git a/src/diffusers/modular_pipelines/flux/modular_pipeline.py b/src/diffusers/modular_pipelines/flux/modular_pipeline.py
index e97445d411e4..7d869041f2a9 100644
--- a/src/diffusers/modular_pipelines/flux/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/flux/modular_pipeline.py
@@ -32,6 +32,8 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion
"""
+ default_blocks_name = "FluxAutoBlocks"
+
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
diff --git a/src/diffusers/modular_pipelines/mellon_node_utils.py b/src/diffusers/modular_pipelines/mellon_node_utils.py
new file mode 100644
index 000000000000..a405aebee221
--- /dev/null
+++ b/src/diffusers/modular_pipelines/mellon_node_utils.py
@@ -0,0 +1,763 @@
+import json
+import logging
+import os
+
+# Simple typed wrapper for parameter overrides
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from huggingface_hub import create_repo, hf_hub_download
+from huggingface_hub.utils import (
+ EntryNotFoundError,
+ HfHubHTTPError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+ validate_hf_hub_args,
+)
+
+from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, PushToHubMixin, extract_commit_hash
+from .modular_pipeline import ModularPipelineBlocks
+
+
+logger = logging.getLogger(__name__)
+
+
+SUPPORTED_NODE_TYPES = {"controlnet", "vae_encoder", "denoise", "text_encoder", "decoder"}
+
+
+# Mellon Input Parameters (runtime parameters, not models)
+MELLON_INPUT_PARAMS = {
+ # controlnet
+ "control_image": {
+ "label": "Control Image",
+ "type": "image",
+ "display": "input",
+ },
+ "controlnet_conditioning_scale": {
+ "label": "Scale",
+ "type": "float",
+ "default": 0.5,
+ "min": 0,
+ "max": 1,
+ },
+ "control_guidance_end": {
+ "label": "End",
+ "type": "float",
+ "default": 1.0,
+ "min": 0,
+ "max": 1,
+ },
+ "control_guidance_start": {
+ "label": "Start",
+ "type": "float",
+ "default": 0.0,
+ "min": 0,
+ "max": 1,
+ },
+ "controlnet": {
+ "label": "Controlnet",
+ "type": "custom_controlnet",
+ "display": "input",
+ },
+ "embeddings": {
+ "label": "Text Embeddings",
+ "display": "input",
+ "type": "embeddings",
+ },
+ "image": {
+ "label": "Image",
+ "type": "image",
+ "display": "input",
+ },
+ "negative_prompt": {
+ "label": "Negative Prompt",
+ "type": "string",
+ "default": "",
+ "display": "textarea",
+ },
+ "prompt": {
+ "label": "Prompt",
+ "type": "string",
+ "default": "",
+ "display": "textarea",
+ },
+ "guidance_scale": {
+ "label": "Guidance Scale",
+ "type": "float",
+ "display": "slider",
+ "default": 5,
+ "min": 1.0,
+ "max": 30.0,
+ "step": 0.1,
+ },
+ "height": {
+ "label": "Height",
+ "type": "int",
+ "default": 1024,
+ "min": 64,
+ "step": 8,
+ },
+ "image_latents": {
+ "label": "Image Latents",
+ "type": "latents",
+ "display": "input",
+ "onChange": {False: ["height", "width"], True: ["strength"]},
+ },
+ "latents": {
+ "label": "Latents",
+ "type": "latents",
+ "display": "input",
+ },
+ "num_inference_steps": {
+ "label": "Steps",
+ "type": "int",
+ "display": "slider",
+ "default": 25,
+ "min": 1,
+ "max": 100,
+ },
+ "seed": {
+ "label": "Seed",
+ "type": "int",
+ "display": "random",
+ "default": 0,
+ "min": 0,
+ "max": 4294967295,
+ },
+ "strength": {
+ "label": "Strength",
+ "type": "float",
+ "default": 0.5,
+ "min": 0.0,
+ "max": 1.0,
+ "step": 0.01,
+ },
+ "width": {
+ "label": "Width",
+ "type": "int",
+ "default": 1024,
+ "min": 64,
+ "step": 8,
+ },
+ "ip_adapter": {
+ "label": "IP Adapter",
+ "type": "custom_ip_adapter",
+ "display": "input",
+ },
+}
+
+# Mellon Model Parameters (diffusers_auto_model types)
+MELLON_MODEL_PARAMS = {
+ "scheduler": {
+ "label": "Scheduler",
+ "display": "input",
+ "type": "diffusers_auto_model",
+ },
+ "text_encoders": {
+ "label": "Text Encoders",
+ "type": "diffusers_auto_models",
+ "display": "input",
+ },
+ "unet": {
+ "label": "Unet",
+ "display": "input",
+ "type": "diffusers_auto_model",
+ "onSignal": {
+ "action": "signal",
+ "target": "guider",
+ },
+ },
+ "guider": {
+ "label": "Guider",
+ "display": "input",
+ "type": "custom_guider",
+ "onChange": {False: ["guidance_scale"], True: []},
+ },
+ "vae": {
+ "label": "VAE",
+ "display": "input",
+ "type": "diffusers_auto_model",
+ },
+ "controlnet": {
+ "label": "Controlnet Model",
+ "type": "diffusers_auto_model",
+ "display": "input",
+ },
+}
+
+# Mellon Output Parameters (display = "output")
+MELLON_OUTPUT_PARAMS = {
+ "embeddings": {
+ "label": "Text Embeddings",
+ "display": "output",
+ "type": "embeddings",
+ },
+ "images": {
+ "label": "Images",
+ "type": "image",
+ "display": "output",
+ },
+ "image_latents": {
+ "label": "Image Latents",
+ "type": "latents",
+ "display": "output",
+ },
+ "latents": {
+ "label": "Latents",
+ "type": "latents",
+ "display": "output",
+ },
+ "latents_preview": {
+ "label": "Latents Preview",
+ "display": "output",
+ "type": "latent",
+ },
+ "controlnet_out": {
+ "label": "Controlnet",
+ "display": "output",
+ "type": "controlnet",
+ },
+}
+
+
+# Default param selections per supported node_type
+# from MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS.
+NODE_TYPE_PARAMS_MAP = {
+ "controlnet": {
+ "inputs": [
+ "control_image",
+ "controlnet_conditioning_scale",
+ "control_guidance_start",
+ "control_guidance_end",
+ "height",
+ "width",
+ ],
+ "model_inputs": [
+ "controlnet",
+ "vae",
+ ],
+ "outputs": [
+ "controlnet",
+ ],
+ "block_names": ["controlnet_vae_encoder"],
+ },
+ "denoise": {
+ "inputs": [
+ "embeddings",
+ "width",
+ "height",
+ "seed",
+ "num_inference_steps",
+ "guidance_scale",
+ "image_latents",
+ "strength",
+ # custom adapters coming in as inputs
+ "controlnet",
+ # ip_adapter is optional and custom; include if available
+ "ip_adapter",
+ ],
+ "model_inputs": [
+ "unet",
+ "guider",
+ "scheduler",
+ ],
+ "outputs": [
+ "latents",
+ "latents_preview",
+ ],
+ "block_names": ["denoise"],
+ },
+ "vae_encoder": {
+ "inputs": [
+ "image",
+ "width",
+ "height",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "image_latents",
+ ],
+ "block_names": ["vae_encoder"],
+ },
+ "text_encoder": {
+ "inputs": [
+ "prompt",
+ "negative_prompt",
+ # optional image prompt input supported in embeddings node
+ "image",
+ ],
+ "model_inputs": [
+ "text_encoders",
+ ],
+ "outputs": [
+ "embeddings",
+ ],
+ "block_names": ["text_encoder"],
+ },
+ "decoder": {
+ "inputs": [
+ "latents",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "images",
+ ],
+ "block_names": ["decode"],
+ },
+}
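+
+# Illustrative sketch (not part of this module's API): entries in the preset lists
+# above are plain strings resolved against the default param maps; a `MellonParam`
+# (defined below) can replace a string to override the UI spec for a single node:
+#
+#   MellonNodeConfig(
+#       inputs=["prompt", MellonParam(name="guidance_scale", label="CFG", type="float", default=3.5)],
+#       model_inputs=["unet", "scheduler"],
+#       outputs=["latents"],
+#       blocks_names=["denoise"],
+#       node_type="denoise",
+#   )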
+
+
+@dataclass(frozen=True)
+class MellonParam:
+ name: str
+ label: str
+ type: str
+ display: Optional[str] = None
+ default: Any = None
+ min: Optional[float] = None
+ max: Optional[float] = None
+ step: Optional[float] = None
+ options: Any = None
+ value: Any = None
+ fieldOptions: Optional[Dict[str, Any]] = None
+ onChange: Any = None
+ onSignal: Any = None
+ _map_to_input: Any = None # the block input name this parameter maps to
+
+ def to_dict(self) -> Dict[str, Any]:
+ data = asdict(self)
+ return {k: v for k, v in data.items() if not k.startswith("_") and v is not None}
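+
+ # Hedged example (not from the source): `to_dict` keeps only public fields that
+ # are not None, so a minimal param serializes compactly:
+ #
+ #   MellonParam(name="cfg", label="CFG", type="float", default=7.5).to_dict()
+ #   # -> {"name": "cfg", "label": "CFG", "type": "float", "default": 7.5}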
+
+
+@dataclass
+class MellonNodeConfig(PushToHubMixin):
+ """
+ A MellonNodeConfig is a base class to build Mellon nodes UI with modular diffusers.
+
+
+
+ This is an experimental feature and is likely to change in the future.
+
+
+ """
+
+ inputs: List[Union[str, MellonParam]]
+ model_inputs: List[Union[str, MellonParam]]
+ outputs: List[Union[str, MellonParam]]
+ blocks_names: List[str]
+ node_type: str
+ config_name = "mellon_config.json"
+
+ def __post_init__(self):
+ if isinstance(self.inputs, list):
+ self.inputs = self._resolve_params_list(self.inputs, MELLON_INPUT_PARAMS)
+ if isinstance(self.model_inputs, list):
+ self.model_inputs = self._resolve_params_list(self.model_inputs, MELLON_MODEL_PARAMS)
+ if isinstance(self.outputs, list):
+ self.outputs = self._resolve_params_list(self.outputs, MELLON_OUTPUT_PARAMS)
+
+ @staticmethod
+ def _resolve_params_list(
+ params: List[Union[str, MellonParam]], default_map: Dict[str, Dict[str, Any]]
+ ) -> Dict[str, Dict[str, Any]]:
+ def _resolve_param(
+ param: Union[str, MellonParam], default_params_map: Dict[str, Dict[str, Any]]
+ ) -> Tuple[str, Dict[str, Any]]:
+ if isinstance(param, str):
+ if param not in default_params_map:
+ raise ValueError(f"Unknown param '{param}', please define a `MellonParam` object instead")
+ return param, default_params_map[param].copy()
+ elif isinstance(param, MellonParam):
+ param_dict = param.to_dict()
+ param_name = param_dict.pop("name")
+ return param_name, param_dict
+ else:
+ raise ValueError(
+ f"Unknown param type '{type(param)}', please use a string or a `MellonParam` object instead"
+ )
+
+ resolved = {}
+ for p in params:
+ logger.info(f" Resolving param: {p}")
+ name, cfg = _resolve_param(p, default_map)
+ if name in resolved:
+ raise ValueError(f"Duplicate param '{name}'")
+ resolved[name] = cfg
+ return resolved
+
+ @classmethod
+ @validate_hf_hub_args
+ def load_mellon_config(
+ cls,
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ return_unused_kwargs=False,
+ return_commit_hash=False,
+ **kwargs,
+ ) -> Union[Dict[str, Any], Tuple]:
+ r"""
+ Load a Mellon node configuration from a local file or directory, or from a repository on the Hub.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ Can be either:
+
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+ the Hub.
+ - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with
+ [`~ConfigMixin.save_config`].
+
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+ is not used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
+ won't be downloaded from the Hub.
+ token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+ allowed by Git.
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ Whether unused keyword arguments of the config are returned.
+ return_commit_hash (`bool`, *optional*, defaults to `False`):
+ Whether the `commit_hash` of the loaded configuration is returned.
+
+ Returns:
+ `dict`:
+ A dictionary of all the parameters stored in a JSON configuration file.
+
+ """
+ cache_dir = kwargs.pop("cache_dir", None)
+ local_dir = kwargs.pop("local_dir", None)
+ local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto")
+ force_download = kwargs.pop("force_download", False)
+ proxies = kwargs.pop("proxies", None)
+ token = kwargs.pop("token", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ revision = kwargs.pop("revision", None)
+
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+ if cls.config_name is None:
+ raise ValueError(
+ "`self.config_name` is not defined. Note that one should not load a config from "
+ "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
+ )
+ if os.path.isfile(pretrained_model_name_or_path):
+ config_file = pretrained_model_name_or_path
+ elif os.path.isdir(pretrained_model_name_or_path):
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
+ # Load from a config file in a local directory
+ config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
+ else:
+ raise EnvironmentError(
+ f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
+ )
+ else:
+ try:
+ # Load from URL or cache if already cached
+ config_file = hf_hub_download(
+ pretrained_model_name_or_path,
+ filename=cls.config_name,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ token=token,
+ revision=revision,
+ local_dir=local_dir,
+ local_dir_use_symlinks=local_dir_use_symlinks,
+ )
+ except RepositoryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
+ " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
+ " token having permission to this repo with `token` or log in with `hf auth login`."
+ )
+ except RevisionNotFoundError:
+ raise EnvironmentError(
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
+ " this model name. Check the model page at"
+ f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+ )
+ except EntryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
+ )
+ except HfHubHTTPError as err:
+ raise EnvironmentError(
+ "There was a specific connection error when trying to load"
+ f" {pretrained_model_name_or_path}:\n{err}"
+ )
+ except ValueError:
+ raise EnvironmentError(
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+ f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
+ " run the library in offline mode at"
+ " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+ )
+ except EnvironmentError:
+ raise EnvironmentError(
+ f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+ f"containing a {cls.config_name} file"
+ )
+ try:
+ with open(config_file, "r", encoding="utf-8") as reader:
+ text = reader.read()
+ config_dict = json.loads(text)
+
+ commit_hash = extract_commit_hash(config_file)
+ except (json.JSONDecodeError, UnicodeDecodeError):
+ raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
+
+ if not (return_unused_kwargs or return_commit_hash):
+ return config_dict
+
+ outputs = (config_dict,)
+
+ if return_unused_kwargs:
+ outputs += (kwargs,)
+
+ if return_commit_hash:
+ outputs += (commit_hash,)
+
+ return outputs
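+
+ # Hedged usage sketch (the repo id below is hypothetical):
+ #
+ #   config_dict = MellonNodeConfig.load_mellon_config("someuser/mellon-denoise-node")
+ #   node_config = MellonNodeConfig.from_mellon_dict(config_dict)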
+
+ def save_mellon_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+ """
+ Save the Mellon node definition to a JSON file.
+
+ Args:
+ save_directory (`str` or `os.PathLike`):
+ Directory where the configuration JSON file is saved (will be created if it does not exist).
+ push_to_hub (`bool`, *optional*, defaults to `False`):
+ Whether or not to push your configuration to the Hugging Face Hub after saving it. You can specify the
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+ namespace).
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+ """
+ if os.path.isfile(save_directory):
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ # If we save using the predefined names, we can load using `from_config`
+ output_config_file = os.path.join(save_directory, self.config_name)
+
+ self.to_json_file(output_config_file)
+ logger.info(f"Mellon node definition saved in {output_config_file}")
+
+ if push_to_hub:
+ commit_message = kwargs.pop("commit_message", None)
+ private = kwargs.pop("private", None)
+ create_pr = kwargs.pop("create_pr", False)
+ token = kwargs.pop("token", None)
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+ repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+ subfolder = kwargs.pop("subfolder", None)
+
+ self._upload_folder(
+ save_directory,
+ repo_id,
+ token=token,
+ commit_message=commit_message,
+ create_pr=create_pr,
+ subfolder=subfolder,
+ )
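+
+ # Hedged usage sketch (the repo id below is hypothetical):
+ #
+ #   config.save_mellon_config("./mellon_denoise_node", push_to_hub=True, repo_id="someuser/mellon-nodes")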
+
+ def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+ """
+ Save the Mellon schema dictionary to a JSON file.
+
+ Args:
+ json_file_path (`str` or `os.PathLike`):
+ Path to the JSON file to save a configuration instance's parameters.
+ """
+ with open(json_file_path, "w", encoding="utf-8") as writer:
+ writer.write(self.to_json_string())
+
+ def to_json_string(self) -> str:
+ """
+ Serializes this instance to a JSON string of the Mellon schema dict.
+
+ Returns:
+ `str`: String containing all the attributes that make up this configuration instance in JSON format.
+ """
+
+ mellon_dict = self.to_mellon_dict()
+ return json.dumps(mellon_dict, indent=2, sort_keys=True) + "\n"
+
+ def to_mellon_dict(self) -> Dict[str, Any]:
+ """Return a JSON-serializable dict focusing on the Mellon schema fields only.
+
+ params is a single flat dict composed as: {**inputs, **model_inputs, **outputs}.
+ """
+ # inputs/model_inputs/outputs are already normalized dicts
+ merged_params = {}
+ merged_params.update(self.inputs or {})
+ merged_params.update(self.model_inputs or {})
+ merged_params.update(self.outputs or {})
+
+ return {
+ "node_type": self.node_type,
+ "blocks_names": self.blocks_names,
+ "params": merged_params,
+ }
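+
+ # Hedged shape sketch of the serialized schema for the "decoder" preset (param
+ # dicts abridged):
+ #
+ #   {
+ #       "node_type": "decoder",
+ #       "blocks_names": ["decode"],
+ #       "params": {"latents": {...}, "vae": {...}, "images": {...}},
+ #   }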
+
+ @classmethod
+ def from_mellon_dict(cls, mellon_dict: Dict[str, Any]) -> "MellonNodeConfig":
+ """Create a config from a Mellon schema dict produced by to_mellon_dict().
+
+ Splits the flat params dict back into inputs/model_inputs/outputs: params with `display == "output"`
+ become outputs, params typed `diffusers_auto_model`/`diffusers_auto_models` become model_inputs, and all
+ remaining params are treated as inputs.
+ """
+ flat_params = mellon_dict.get("params", {})
+
+ inputs: Dict[str, Any] = {}
+ model_inputs: Dict[str, Any] = {}
+ outputs: Dict[str, Any] = {}
+
+ for param_name, param_dict in flat_params.items():
+ if param_dict.get("display", "") == "output":
+ outputs[param_name] = param_dict
+ elif param_dict.get("type", "") in ("diffusers_auto_model", "diffusers_auto_models"):
+ model_inputs[param_name] = param_dict
+ else:
+ inputs[param_name] = param_dict
+
+ return cls(
+ inputs=inputs,
+ model_inputs=model_inputs,
+ outputs=outputs,
+ blocks_names=mellon_dict.get("blocks_names", []),
+ node_type=mellon_dict.get("node_type"),
+ )
+
+ # YiYi Notes: not used yet
+ @classmethod
+ def from_blocks(cls, blocks: ModularPipelineBlocks, node_type: str) -> "MellonNodeConfig":
+ """
+ Create an instance from a ModularPipelineBlocks object. Starts from the NODE_TYPE_PARAMS_MAP preset for the
+ node_type and extends it with any required inputs and expected components of the blocks that the preset does
+ not already cover.
+ """
+ if node_type not in NODE_TYPE_PARAMS_MAP:
+ raise ValueError(f"Node type {node_type} not supported")
+
+ blocks_names = list(blocks.sub_blocks.keys())
+
+ default_node_config = NODE_TYPE_PARAMS_MAP[node_type]
+ # copy the preset lists so appending below does not mutate the module-level NODE_TYPE_PARAMS_MAP
+ inputs_list: List[Union[str, MellonParam]] = list(default_node_config.get("inputs", []))
+ model_inputs_list: List[Union[str, MellonParam]] = list(default_node_config.get("model_inputs", []))
+ outputs_list: List[Union[str, MellonParam]] = list(default_node_config.get("outputs", []))
+
+ for required_input_name in blocks.required_inputs:
+ if required_input_name not in inputs_list:
+ inputs_list.append(
+ MellonParam(
+ name=required_input_name, label=required_input_name, type=required_input_name, display="input"
+ )
+ )
+
+ for component_spec in blocks.expected_components:
+ if component_spec.name not in model_inputs_list:
+ model_inputs_list.append(
+ MellonParam(
+ name=component_spec.name,
+ label=component_spec.name,
+ type="diffusers_auto_model",
+ display="input",
+ )
+ )
+
+ return cls(
+ inputs=inputs_list,
+ model_inputs=model_inputs_list,
+ outputs=outputs_list,
+ blocks_names=blocks_names,
+ node_type=node_type,
+ )
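+
+ # Hedged sketch of `from_blocks` (marked unused above); the block class and its
+ # import path are assumptions based on this PR's QwenImage presets:
+ #
+ #   from diffusers.modular_pipelines.qwenimage.modular_blocks import QwenImageCoreDenoiseStep
+ #
+ #   blocks = QwenImageCoreDenoiseStep()
+ #   config = MellonNodeConfig.from_blocks(blocks, node_type="denoise")
+ #   config.save_mellon_config("./mellon_denoise_node")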
+
+
+# Minimal modular registry for Mellon node configs
+class ModularMellonNodeRegistry:
+ """Registry mapping (pipeline class, blocks_name) -> list of MellonNodeConfig."""
+
+ def __init__(self):
+ self._registry = {}
+ self._initialized = False
+
+ def register(self, pipeline_cls: type, node_params: Dict[str, MellonNodeConfig]):
+ if not self._initialized:
+ _initialize_registry(self)
+ self._registry[pipeline_cls] = node_params
+
+ def get(self, pipeline_cls: type) -> Optional[Dict[str, MellonNodeConfig]]:
+ if not self._initialized:
+ _initialize_registry(self)
+ return self._registry.get(pipeline_cls, None)
+
+ def get_all(self) -> Dict[type, Dict[str, MellonNodeConfig]]:
+ if not self._initialized:
+ _initialize_registry(self)
+ return self._registry
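+
+ # Hedged usage sketch: a front-end can look up the preset node configs that
+ # `_initialize_registry` (below) registers for a pipeline class:
+ #
+ #   from diffusers.modular_pipelines.qwenimage.modular_pipeline import QwenImageModularPipeline
+ #
+ #   registry = ModularMellonNodeRegistry()
+ #   node_configs = registry.get(QwenImageModularPipeline)  # {"denoise": MellonNodeConfig, ...}
+ #   denoise_schema = node_configs["denoise"].to_mellon_dict()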
+
+
+def _register_preset_node_types(
+ pipeline_cls, params_map: Dict[str, Dict[str, Any]], registry: ModularMellonNodeRegistry
+):
+ """Register all node-type presets for a given pipeline class from a params map."""
+ node_configs = {}
+ for node_type, spec in params_map.items():
+ node_config = MellonNodeConfig(
+ inputs=spec.get("inputs", []),
+ model_inputs=spec.get("model_inputs", []),
+ outputs=spec.get("outputs", []),
+ blocks_names=spec.get("block_names", []),
+ node_type=node_type,
+ )
+ node_configs[node_type] = node_config
+ registry.register(pipeline_cls, node_configs)
+
+
+def _initialize_registry(registry: ModularMellonNodeRegistry):
+ """Initialize the registry and register all available pipeline configs."""
+ print("Initializing registry")
+
+ registry._initialized = True
+
+ try:
+ from .qwenimage.modular_pipeline import QwenImageModularPipeline
+ from .qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP
+
+ _register_preset_node_types(QwenImageModularPipeline, QwenImage_NODE_TYPES_PARAMS_MAP, registry)
+ except Exception as e:
+ raise Exception("Failed to register QwenImageModularPipeline") from e
+
+ try:
+ from .stable_diffusion_xl.modular_pipeline import StableDiffusionXLModularPipeline
+ from .stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP
+
+ _register_preset_node_types(StableDiffusionXLModularPipeline, SDXL_NODE_TYPES_PARAMS_MAP, registry)
+ except Exception as e:
+ raise Exception("Failed to register StableDiffusionXLModularPipeline") from e
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index 74ffc6234894..206d19f17371 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -51,6 +51,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+# map regular pipeline to modular pipeline class name
MODULAR_PIPELINE_MAPPING = OrderedDict(
[
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
@@ -61,16 +62,6 @@
]
)
-MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict(
- [
- ("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"),
- ("WanModularPipeline", "WanAutoBlocks"),
- ("FluxModularPipeline", "FluxAutoBlocks"),
- ("QwenImageModularPipeline", "QwenImageAutoBlocks"),
- ("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"),
- ]
-)
-
@dataclass
class PipelineState:
@@ -423,7 +414,7 @@ def set_block_state(self, state: PipelineState, block_state: BlockState):
state.set(input_param.name, param, input_param.kwargs_type)
elif input_param.kwargs_type:
- # if it is a kwargs type, e.g. "guider_input_fields", it is likely to be a list of parameters
+ # if it is a kwargs type, e.g. "denoiser_input_fields", it is likely to be a list of parameters
# we need to first find out which inputs are and loop through them.
intermediate_kwargs = state.get_by_kwargs(input_param.kwargs_type)
for param_name, current_value in intermediate_kwargs.items():
@@ -1454,6 +1445,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
config_name = "modular_model_index.json"
hf_device_map = None
+ default_blocks_name = None
# YiYi TODO: add warning for passing multiple ComponentSpec/ConfigSpec with the same name
def __init__(
@@ -1514,7 +1506,7 @@ def __init__(
`_blocks_class_name` in the config dict
"""
if blocks is None:
- blocks_class_name = MODULAR_PIPELINE_BLOCKS_MAPPING.get(self.__class__.__name__)
+ blocks_class_name = self.default_blocks_name
if blocks_class_name is not None:
diffusers_module = importlib.import_module("diffusers")
blocks_class = getattr(diffusers_module, blocks_class_name)
diff --git a/src/diffusers/modular_pipelines/node_utils.py b/src/diffusers/modular_pipelines/node_utils.py
deleted file mode 100644
index 5db860c7887d..000000000000
--- a/src/diffusers/modular_pipelines/node_utils.py
+++ /dev/null
@@ -1,665 +0,0 @@
-import json
-import logging
-import os
-from pathlib import Path
-from typing import List, Optional, Tuple, Union
-
-import numpy as np
-import PIL
-import torch
-
-from ..configuration_utils import ConfigMixin
-from ..image_processor import PipelineImageInput
-from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks
-from .modular_pipeline_utils import InputParam
-
-
-logger = logging.getLogger(__name__)
-
-# YiYi Notes: this is actually for SDXL, put it here for now
-SDXL_INPUTS_SCHEMA = {
- "prompt": InputParam(
- "prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation"
- ),
- "prompt_2": InputParam(
- "prompt_2",
- type_hint=Union[str, List[str]],
- description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2",
- ),
- "negative_prompt": InputParam(
- "negative_prompt",
- type_hint=Union[str, List[str]],
- description="The prompt or prompts not to guide the image generation",
- ),
- "negative_prompt_2": InputParam(
- "negative_prompt_2",
- type_hint=Union[str, List[str]],
- description="The negative prompt or prompts for text_encoder_2",
- ),
- "cross_attention_kwargs": InputParam(
- "cross_attention_kwargs",
- type_hint=Optional[dict],
- description="Kwargs dictionary passed to the AttentionProcessor",
- ),
- "clip_skip": InputParam(
- "clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"
- ),
- "image": InputParam(
- "image",
- type_hint=PipelineImageInput,
- required=True,
- description="The image(s) to modify for img2img or inpainting",
- ),
- "mask_image": InputParam(
- "mask_image",
- type_hint=PipelineImageInput,
- required=True,
- description="Mask image for inpainting, white pixels will be repainted",
- ),
- "generator": InputParam(
- "generator",
- type_hint=Optional[Union[torch.Generator, List[torch.Generator]]],
- description="Generator(s) for deterministic generation",
- ),
- "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"),
- "width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"),
- "num_images_per_prompt": InputParam(
- "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"
- ),
- "num_inference_steps": InputParam(
- "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"
- ),
- "timesteps": InputParam(
- "timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process"
- ),
- "sigmas": InputParam(
- "sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process"
- ),
- "denoising_end": InputParam(
- "denoising_end",
- type_hint=Optional[float],
- description="Fraction of denoising process to complete before termination",
- ),
- # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
- "strength": InputParam(
- "strength", type_hint=float, default=0.3, description="How much to transform the reference image"
- ),
- "denoising_start": InputParam(
- "denoising_start", type_hint=Optional[float], description="Starting point of the denoising process"
- ),
- "latents": InputParam(
- "latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation"
- ),
- "padding_mask_crop": InputParam(
- "padding_mask_crop",
- type_hint=Optional[Tuple[int, int]],
- description="Size of margin in crop for image and mask",
- ),
- "original_size": InputParam(
- "original_size",
- type_hint=Optional[Tuple[int, int]],
- description="Original size of the image for SDXL's micro-conditioning",
- ),
- "target_size": InputParam(
- "target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning"
- ),
- "negative_original_size": InputParam(
- "negative_original_size",
- type_hint=Optional[Tuple[int, int]],
- description="Negative conditioning based on image resolution",
- ),
- "negative_target_size": InputParam(
- "negative_target_size",
- type_hint=Optional[Tuple[int, int]],
- description="Negative conditioning based on target resolution",
- ),
- "crops_coords_top_left": InputParam(
- "crops_coords_top_left",
- type_hint=Tuple[int, int],
- default=(0, 0),
- description="Top-left coordinates for SDXL's micro-conditioning",
- ),
- "negative_crops_coords_top_left": InputParam(
- "negative_crops_coords_top_left",
- type_hint=Tuple[int, int],
- default=(0, 0),
- description="Negative conditioning crop coordinates",
- ),
- "aesthetic_score": InputParam(
- "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"
- ),
- "negative_aesthetic_score": InputParam(
- "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"
- ),
- "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
- "output_type": InputParam(
- "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"
- ),
- "ip_adapter_image": InputParam(
- "ip_adapter_image",
- type_hint=PipelineImageInput,
- required=True,
- description="Image(s) to be used as IP adapter",
- ),
- "control_image": InputParam(
- "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"
- ),
- "control_guidance_start": InputParam(
- "control_guidance_start",
- type_hint=Union[float, List[float]],
- default=0.0,
- description="When ControlNet starts applying",
- ),
- "control_guidance_end": InputParam(
- "control_guidance_end",
- type_hint=Union[float, List[float]],
- default=1.0,
- description="When ControlNet stops applying",
- ),
- "controlnet_conditioning_scale": InputParam(
- "controlnet_conditioning_scale",
- type_hint=Union[float, List[float]],
- default=1.0,
- description="Scale factor for ControlNet outputs",
- ),
- "guess_mode": InputParam(
- "guess_mode",
- type_hint=bool,
- default=False,
- description="Enables ControlNet encoder to recognize input without prompts",
- ),
- "control_mode": InputParam(
- "control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet"
- ),
-}
-
-SDXL_INTERMEDIATE_INPUTS_SCHEMA = {
- "prompt_embeds": InputParam(
- "prompt_embeds",
- type_hint=torch.Tensor,
- required=True,
- description="Text embeddings used to guide image generation",
- ),
- "negative_prompt_embeds": InputParam(
- "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
- ),
- "pooled_prompt_embeds": InputParam(
- "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"
- ),
- "negative_pooled_prompt_embeds": InputParam(
- "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
- ),
- "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
- "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
- "preprocess_kwargs": InputParam(
- "preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor"
- ),
- "latents": InputParam(
- "latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process"
- ),
- "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"),
- "num_inference_steps": InputParam(
- "num_inference_steps", type_hint=int, required=True, description="Number of denoising steps"
- ),
- "latent_timestep": InputParam(
- "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"
- ),
- "image_latents": InputParam(
- "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"
- ),
- "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
- "masked_image_latents": InputParam(
- "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
- ),
- "add_time_ids": InputParam(
- "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"
- ),
- "negative_add_time_ids": InputParam(
- "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
- ),
- "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
- "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
- "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
- "ip_adapter_embeds": InputParam(
- "ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"
- ),
- "negative_ip_adapter_embeds": InputParam(
- "negative_ip_adapter_embeds",
- type_hint=List[torch.Tensor],
- description="Negative image embeddings for IP-Adapter",
- ),
- "images": InputParam(
- "images",
- type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
- required=True,
- description="Generated images",
- ),
-}
-
-SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA}
-
-
-DEFAULT_PARAM_MAPS = {
- "prompt": {
- "label": "Prompt",
- "type": "string",
- "default": "a bear sitting in a chair drinking a milkshake",
- "display": "textarea",
- },
- "negative_prompt": {
- "label": "Negative Prompt",
- "type": "string",
- "default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
- "display": "textarea",
- },
- "num_inference_steps": {
- "label": "Steps",
- "type": "int",
- "default": 25,
- "min": 1,
- "max": 1000,
- },
- "seed": {
- "label": "Seed",
- "type": "int",
- "default": 0,
- "min": 0,
- "display": "random",
- },
- "width": {
- "label": "Width",
- "type": "int",
- "display": "text",
- "default": 1024,
- "min": 8,
- "max": 8192,
- "step": 8,
- "group": "dimensions",
- },
- "height": {
- "label": "Height",
- "type": "int",
- "display": "text",
- "default": 1024,
- "min": 8,
- "max": 8192,
- "step": 8,
- "group": "dimensions",
- },
- "images": {
- "label": "Images",
- "type": "image",
- "display": "output",
- },
- "image": {
- "label": "Image",
- "type": "image",
- "display": "input",
- },
-}
-
-DEFAULT_TYPE_MAPS = {
- "int": {
- "type": "int",
- "default": 0,
- "min": 0,
- },
- "float": {
- "type": "float",
- "default": 0.0,
- "min": 0.0,
- },
- "str": {
- "type": "string",
- "default": "",
- },
- "bool": {
- "type": "boolean",
- "default": False,
- },
- "image": {
- "type": "image",
- },
-}
-
-DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"]
-DEFAULT_CATEGORY = "Modular Diffusers"
-DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"]
-DEFAULT_PARAMS_GROUPS_KEYS = {
- "text_encoders": ["text_encoder", "tokenizer"],
- "ip_adapter_embeds": ["ip_adapter_embeds"],
- "prompt_embeddings": ["prompt_embeds"],
-}
-
-
-def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS):
- """
- Get the group name for a given parameter name, if not part of a group, return None e.g. "prompt_embeds" ->
- "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None
- """
- if name is None:
- return None
- for group_name, group_keys in group_params_keys.items():
- for group_key in group_keys:
- if group_key in name:
- return group_name
- return None
-
-
-class ModularNode(ConfigMixin):
- """
- A ModularNode is a base class to build UI nodes using diffusers. Currently only supports Mellon. It is a wrapper
- around a ModularPipelineBlocks object.
-
-
-
- This is an experimental feature and is likely to change in the future.
-
-
- """
-
- config_name = "node_config.json"
-
- @classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path: str,
- trust_remote_code: Optional[bool] = None,
- **kwargs,
- ):
- blocks = ModularPipelineBlocks.from_pretrained(
- pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
- )
- return cls(blocks, **kwargs)
-
- def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs):
- self.blocks = blocks
-
- if label is None:
- label = self.blocks.__class__.__name__
- # blocks param name -> mellon param name
- self.name_mapping = {}
-
- input_params = {}
- # pass or create a default param dict for each input
- # e.g. for prompt,
- # prompt = {
- # "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers
- # "label": "Prompt",
- # "type": "string",
- # "default": "a bear sitting in a chair drinking a milkshake",
- # "display": "textarea"}
- # if type is not specified, it'll be a "custom" param of its own type
- # e.g. you can pass ModularNode(scheduler = {name :"scheduler"})
- # it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}}
- # name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}}
- inputs = self.blocks.inputs + self.blocks.intermediate_inputs
- for inp in inputs:
- param = kwargs.pop(inp.name, None)
- if param:
- # user can pass a param dict for all inputs, e.g. ModularNode(prompt = {...})
- input_params[inp.name] = param
- mellon_name = param.pop("name", inp.name)
- if mellon_name != inp.name:
- self.name_mapping[inp.name] = mellon_name
- continue
-
- if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name):
- continue
-
- if inp.name in DEFAULT_PARAM_MAPS:
- # first check if it's in the default param map, if so, directly use that
- param = DEFAULT_PARAM_MAPS[inp.name].copy()
- elif get_group_name(inp.name):
- param = get_group_name(inp.name)
- if inp.name not in self.name_mapping:
- self.name_mapping[inp.name] = param
- else:
- # if not, check if it's in the SDXL input schema, if so,
- # 1. use the type hint to determine the type
- # 2. use the default param dict for the type e.g. if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}}
- if inp.type_hint is not None:
- type_str = str(inp.type_hint).lower()
- else:
- inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None)
- type_str = str(inp_spec.type_hint).lower() if inp_spec else ""
- for type_key, type_param in DEFAULT_TYPE_MAPS.items():
- if type_key in type_str:
- param = type_param.copy()
- param["label"] = inp.name
- param["display"] = "input"
- break
- else:
- param = inp.name
- # add the param dict to the inp_params dict
- input_params[inp.name] = param
-
- component_params = {}
- for comp in self.blocks.expected_components:
- param = kwargs.pop(comp.name, None)
- if param:
- component_params[comp.name] = param
- mellon_name = param.pop("name", comp.name)
- if mellon_name != comp.name:
- self.name_mapping[comp.name] = mellon_name
- continue
-
- to_exclude = False
- for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS:
- if exclude_key in comp.name:
- to_exclude = True
- break
- if to_exclude:
- continue
-
- if get_group_name(comp.name):
- param = get_group_name(comp.name)
- if comp.name not in self.name_mapping:
- self.name_mapping[comp.name] = param
- elif comp.name in DEFAULT_MODEL_KEYS:
- param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"}
- else:
- param = comp.name
- # add the param dict to the model_params dict
- component_params[comp.name] = param
-
- output_params = {}
- if isinstance(self.blocks, SequentialPipelineBlocks):
- last_block_name = list(self.blocks.sub_blocks.keys())[-1]
- outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs
- else:
- outputs = self.blocks.intermediate_outputs
-
- for out in outputs:
- param = kwargs.pop(out.name, None)
- if param:
- output_params[out.name] = param
- mellon_name = param.pop("name", out.name)
- if mellon_name != out.name:
- self.name_mapping[out.name] = mellon_name
- continue
-
- if out.name in DEFAULT_PARAM_MAPS:
- param = DEFAULT_PARAM_MAPS[out.name].copy()
- param["display"] = "output"
- else:
- group_name = get_group_name(out.name)
- if group_name:
- param = group_name
- if out.name not in self.name_mapping:
- self.name_mapping[out.name] = param
- else:
- param = out.name
- # add the param dict to the outputs dict
- output_params[out.name] = param
-
- if len(kwargs) > 0:
- logger.warning(f"Unused kwargs: {kwargs}")
-
- register_dict = {
- "category": category,
- "label": label,
- "input_params": input_params,
- "component_params": component_params,
- "output_params": output_params,
- "name_mapping": self.name_mapping,
- }
- self.register_to_config(**register_dict)
-
- def setup(self, components_manager, collection=None):
- self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection)
- self._components_manager = components_manager
-
- @property
- def mellon_config(self):
- return self._convert_to_mellon_config()
-
- def _convert_to_mellon_config(self):
- node = {}
- node["label"] = self.config.label
- node["category"] = self.config.category
-
- node_param = {}
- for inp_name, inp_param in self.config.input_params.items():
- if inp_name in self.name_mapping:
- mellon_name = self.name_mapping[inp_name]
- else:
- mellon_name = inp_name
- if isinstance(inp_param, str):
- param = {
- "label": inp_param,
- "type": inp_param,
- "display": "input",
- }
- else:
- param = inp_param
-
- if mellon_name not in node_param:
- node_param[mellon_name] = param
- else:
- logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}")
-
- for comp_name, comp_param in self.config.component_params.items():
- if comp_name in self.name_mapping:
- mellon_name = self.name_mapping[comp_name]
- else:
- mellon_name = comp_name
- if isinstance(comp_param, str):
- param = {
- "label": comp_param,
- "type": comp_param,
- "display": "input",
- }
- else:
- param = comp_param
-
- if mellon_name not in node_param:
- node_param[mellon_name] = param
- else:
- logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}")
-
- for out_name, out_param in self.config.output_params.items():
- if out_name in self.name_mapping:
- mellon_name = self.name_mapping[out_name]
- else:
- mellon_name = out_name
- if isinstance(out_param, str):
- param = {
- "label": out_param,
- "type": out_param,
- "display": "output",
- }
- else:
- param = out_param
-
- if mellon_name not in node_param:
- node_param[mellon_name] = param
- else:
- logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}")
- node["params"] = node_param
- return node
-
- def save_mellon_config(self, file_path):
- """
- Save the Mellon configuration to a JSON file.
-
- Args:
- file_path (str or Path): Path where the JSON file will be saved
-
- Returns:
- Path: Path to the saved config file
- """
- file_path = Path(file_path)
-
- # Create directory if it doesn't exist
- os.makedirs(file_path.parent, exist_ok=True)
-
- # Create a combined dictionary with module definition and name mapping
- config = {"module": self.mellon_config, "name_mapping": self.name_mapping}
-
- # Save the config to file
- with open(file_path, "w", encoding="utf-8") as f:
- json.dump(config, f, indent=2)
-
- logger.info(f"Mellon config and name mapping saved to {file_path}")
-
- return file_path
-
- @classmethod
- def load_mellon_config(cls, file_path):
- """
- Load a Mellon configuration from a JSON file.
-
- Args:
- file_path (str or Path): Path to the JSON file containing Mellon config
-
- Returns:
- dict: The loaded combined configuration containing 'module' and 'name_mapping'
- """
- file_path = Path(file_path)
-
- if not file_path.exists():
- raise FileNotFoundError(f"Config file not found: {file_path}")
-
- with open(file_path, "r", encoding="utf-8") as f:
- config = json.load(f)
-
- logger.info(f"Mellon config loaded from {file_path}")
-
- return config
-
- def process_inputs(self, **kwargs):
- params_components = {}
- for comp_name, comp_param in self.config.component_params.items():
- logger.debug(f"component: {comp_name}")
- mellon_comp_name = self.name_mapping.get(comp_name, comp_name)
- if mellon_comp_name in kwargs:
- if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]:
- comp = kwargs[mellon_comp_name].pop(comp_name)
- else:
- comp = kwargs.pop(mellon_comp_name)
- if comp:
- params_components[comp_name] = self._components_manager.get_one(comp["model_id"])
-
- params_run = {}
- for inp_name, inp_param in self.config.input_params.items():
- logger.debug(f"input: {inp_name}")
- mellon_inp_name = self.name_mapping.get(inp_name, inp_name)
- if mellon_inp_name in kwargs:
- if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]:
- inp = kwargs[mellon_inp_name].pop(inp_name)
- else:
- inp = kwargs.pop(mellon_inp_name)
- if inp is not None:
- params_run[inp_name] = inp
-
- return_output_names = list(self.config.output_params.keys())
-
- return params_components, params_run, return_output_names
-
- def execute(self, **kwargs):
- params_components, params_run, return_output_names = self.process_inputs(**kwargs)
-
- self.pipeline.update_components(**params_components)
- output = self.pipeline(**params_run, output=return_output_names)
- return output
diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
index 738a1e5d151d..606236cfe91b 100644
--- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -577,9 +577,8 @@ def description(self) -> str:
def inputs(self) -> List[InputParam]:
return [
InputParam(name="batch_size", required=True),
- InputParam(
- name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input"
- ),
+ InputParam(name="image_height", required=True),
+ InputParam(name="image_width", required=True),
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(name="prompt_embeds_mask"),
@@ -612,10 +611,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
block_state = self.get_block_state(state)
# for edit, image size can be different from the target size (height/width)
- image = (
- block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image
- )
- image_width, image_height = image.size
block_state.img_shapes = [
[
@@ -624,7 +619,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
block_state.height // components.vae_scale_factor // 2,
block_state.width // components.vae_scale_factor // 2,
),
- (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2),
+ (
+ 1,
+ block_state.image_height // components.vae_scale_factor // 2,
+ block_state.image_width // components.vae_scale_factor // 2,
+ ),
]
] * block_state.batch_size
diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py
index 280fa6a152c4..2ab83a03ee55 100644
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -496,7 +496,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
)
if components.requires_unconditional_embeds:
- negative_prompt = block_state.negative_prompt or ""
+ negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
components.text_encoder,
components.processor,
diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py
index 2b787c823865..2b229c040b89 100644
--- a/src/diffusers/modular_pipelines/qwenimage/inputs.py
+++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py
@@ -307,6 +307,13 @@ def inputs(self) -> List[InputParam]:
return inputs
+ @property
+ def intermediate_outputs(self) -> List[OutputParam]:
+ return [
+ OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
+ OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
+ ]
+
@property
def expected_components(self) -> List[ComponentSpec]:
return [
@@ -327,6 +334,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
block_state.height = block_state.height or height
block_state.width = block_state.width or width
+ if not hasattr(block_state, "image_height"):
+ block_state.image_height = height
+ if not hasattr(block_state, "image_width"):
+ block_state.image_width = width
+
# 2. Patchify the image latent tensor
image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
index a01c742fcf68..9126766cc202 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
@@ -511,17 +511,42 @@ def description(self):
)
+class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+ model_name = "qwenimage"
+ block_classes = [
+ QwenImageAutoInputStep,
+ QwenImageOptionalControlNetInputStep,
+ QwenImageAutoBeforeDenoiseStep,
+ QwenImageOptionalControlNetBeforeDenoiseStep,
+ QwenImageAutoDenoiseStep,
+ ]
+ block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
+
+ @property
+ def description(self):
+ return (
+ "Core step that performs the denoising process. \n"
+ + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+ + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
+ + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ + " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
+ + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+ + " - for image-to-image generation, you need to provide `image_latents`\n"
+ + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+ + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+ + " - for text-to-image generation, all you need to provide is prompt embeddings"
+ )
+
+
## 1.10 QwenImage/auto block & presets
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageTextEncoderStep()),
("vae_encoder", QwenImageAutoVaeEncoderStep()),
("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
- ("input", QwenImageAutoInputStep()),
- ("controlnet_input", QwenImageOptionalControlNetInputStep()),
- ("before_denoise", QwenImageAutoBeforeDenoiseStep()),
- ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()),
- ("denoise", QwenImageAutoDenoiseStep()),
+ ("denoise", QwenImageCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
@@ -699,7 +724,7 @@ def description(self):
class QwenImageEditAutoInputStep(AutoPipelineBlocks):
block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
block_names = ["edit_inpaint", "edit"]
- block_trigger_inputs = ["processed_mask_image", "image"]
+ block_trigger_inputs = ["processed_mask_image", "image_latents"]
@property
def description(self):
@@ -800,13 +825,34 @@ def description(self):
## 2.7 QwenImage-Edit/auto blocks & presets
+
+class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
+ model_name = "qwenimage-edit"
+ block_classes = [
+ QwenImageEditAutoInputStep,
+ QwenImageEditAutoBeforeDenoiseStep,
+ QwenImageEditAutoDenoiseStep,
+ ]
+ block_names = ["input", "before_denoise", "denoise"]
+
+ @property
+ def description(self):
+ return (
+ "Core step that performs the denoising process. \n"
+ + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+ + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n"
+ + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n"
+ + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n"
+ )
+
+
EDIT_AUTO_BLOCKS = InsertableDict(
[
("text_encoder", QwenImageEditVLEncoderStep()),
("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
- ("input", QwenImageEditAutoInputStep()),
- ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()),
- ("denoise", QwenImageEditAutoDenoiseStep()),
+ ("denoise", QwenImageEditCoreDenoiseStep()),
("decode", QwenImageAutoDecodeStep()),
]
)
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
index fe9757f41bcc..3248d131590f 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
@@ -104,6 +104,8 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
"""
+ default_blocks_name = "QwenImageAutoBlocks"
+
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
@@ -158,6 +160,8 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
"""
+ default_blocks_name = "QwenImageEditAutoBlocks"
+
# YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step.
@property
def default_height(self):
diff --git a/src/diffusers/modular_pipelines/qwenimage/node_utils.py b/src/diffusers/modular_pipelines/qwenimage/node_utils.py
new file mode 100644
index 000000000000..3230ece68abc
--- /dev/null
+++ b/src/diffusers/modular_pipelines/qwenimage/node_utils.py
@@ -0,0 +1,95 @@
+# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# mellon nodes
+QwenImage_NODE_TYPES_PARAMS_MAP = {
+ "controlnet": {
+ "inputs": [
+ "control_image",
+ "controlnet_conditioning_scale",
+ "control_guidance_start",
+ "control_guidance_end",
+ "height",
+ "width",
+ ],
+ "model_inputs": [
+ "controlnet",
+ "vae",
+ ],
+ "outputs": [
+ "controlnet_out",
+ ],
+ "block_names": ["controlnet_vae_encoder"],
+ },
+ "denoise": {
+ "inputs": [
+ "embeddings",
+ "width",
+ "height",
+ "seed",
+ "num_inference_steps",
+ "guidance_scale",
+ "image_latents",
+ "strength",
+ "controlnet",
+ ],
+ "model_inputs": [
+ "unet",
+ "guider",
+ "scheduler",
+ ],
+ "outputs": [
+ "latents",
+ "latents_preview",
+ ],
+ "block_names": ["denoise"],
+ },
+ "vae_encoder": {
+ "inputs": [
+ "image",
+ "width",
+ "height",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "image_latents",
+ ],
+ },
+ "text_encoder": {
+ "inputs": [
+ "prompt",
+ "negative_prompt",
+ ],
+ "model_inputs": [
+ "text_encoders",
+ ],
+ "outputs": [
+ "embeddings",
+ ],
+ },
+ "decoder": {
+ "inputs": [
+ "latents",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "images",
+ ],
+ },
+}
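
A hedged sketch of how this map pairs with the shared `MELLON_INPUT_PARAMS` table defined earlier in this diff: each node type lists its runtime input names, and the table supplies their UI widget specs. The merge helper and its fallback behavior are illustrative, not part of the diff.

```python
from diffusers.modular_pipelines.mellon_node_utils import MELLON_INPUT_PARAMS
from diffusers.modular_pipelines.qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP


def build_node_params(node_type):
    # Resolve each runtime input name to its UI widget spec; names missing
    # from the shared table fall back to a bare label (fallback is an assumption).
    spec = QwenImage_NODE_TYPES_PARAMS_MAP[node_type]
    return {name: MELLON_INPUT_PARAMS.get(name, {"label": name}) for name in spec["inputs"]}


params = build_node_params("text_encoder")
print(sorted(params))  # ['negative_prompt', 'prompt']
```
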
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
index fefa622f1a61..70cbf0c1c78d 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
@@ -262,37 +262,37 @@ def intermediate_outputs(self) -> List[str]:
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"negative_pooled_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative pooled text embeddings used to guide the image generation",
),
OutputParam(
"ip_adapter_embeds",
type_hint=List[torch.Tensor],
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="image embeddings for IP-Adapter",
),
OutputParam(
"negative_ip_adapter_embeds",
type_hint=List[torch.Tensor],
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative image embeddings for IP-Adapter",
),
]
@@ -1120,13 +1120,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
OutputParam(
"add_time_ids",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="The time ids to condition the denoising process",
),
OutputParam(
"negative_add_time_ids",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="The negative time ids to condition the denoising process",
),
OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
@@ -1331,13 +1331,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
OutputParam(
"add_time_ids",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="The time ids to condition the denoising process",
),
OutputParam(
"negative_add_time_ids",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="The negative time ids to condition the denoising process",
),
OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py
index a2e142059532..8a8025747332 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py
@@ -183,14 +183,14 @@ def inputs(self) -> List[Tuple[str, Any]]:
description="The guidance scale embedding to use for Latent Consistency Models(LCMs). Can be generated in prepare_additional_conditioning step.",
),
InputParam(
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
- "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+ "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
]
@@ -307,14 +307,14 @@ def inputs(self) -> List[Tuple[str, Any]]:
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
- "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+ "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
InputParam(
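
On the consumer side, the denoise block receives every field tagged `denoiser_input_fields` as one grouped kwargs dict. The helper below is illustrative only; the actual collection is done by the pipeline-state machinery.

```python
# Sketch only: mimic how a grouped denoiser_input_fields dict splits into
# conditional / unconditional halves for guidance.
def split_denoiser_fields(fields):
    cond = {k: v for k, v in fields.items() if not k.startswith("negative_")}
    uncond = {k[len("negative_"):]: v for k, v in fields.items() if k.startswith("negative_")}
    return cond, uncond


cond, uncond = split_denoiser_fields({"prompt_embeds": 1, "negative_prompt_embeds": 2})
print(cond, uncond)  # {'prompt_embeds': 1} {'prompt_embeds': 2}
```
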
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
index 1e8921d363c1..90b254b6f5d4 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
@@ -258,25 +258,25 @@ def intermediate_outputs(self) -> List[OutputParam]:
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="negative text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"negative_pooled_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="negative pooled text embeddings used to guide the image generation",
),
]
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py
index c9033856bcc0..68b5e33755b5 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py
@@ -82,19 +82,17 @@ def description(self):
# before_denoise: text2img
class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
- StableDiffusionXLInputStep,
StableDiffusionXLSetTimestepsStep,
StableDiffusionXLPrepareLatentsStep,
StableDiffusionXLPrepareAdditionalConditioningStep,
]
- block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+ block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is a sequential pipeline blocks:\n"
- + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -104,19 +102,17 @@ def description(self):
# before_denoise: img2img
class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
- StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLImg2ImgPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
- block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+ block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
+ "This is a sequential pipeline blocks:\n"
- + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -126,19 +122,17 @@ def description(self):
# before_denoise: inpainting
class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
- StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLInpaintPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
- block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+ block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for inpainting task.\n"
+ "This is a sequential pipeline blocks:\n"
- + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -255,25 +249,48 @@ def description(self):
)
+class StableDiffusionXLCoreDenoiseStep(SequentialPipelineBlocks):
+ block_classes = [
+ StableDiffusionXLInputStep,
+ StableDiffusionXLAutoBeforeDenoiseStep,
+ StableDiffusionXLAutoControlNetInputStep,
+ StableDiffusionXLAutoDenoiseStep,
+ ]
+ block_names = ["input", "before_denoise", "controlnet_input", "denoise"]
+
+ @property
+ def description(self):
+ return (
+ "Core step that performs the denoising process. \n"
+ + " - `StableDiffusionXLInputStep` (input) standardizes the inputs for the denoising step.\n"
+ + " - `StableDiffusionXLAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ + " - `StableDiffusionXLAutoControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+ + " - `StableDiffusionXLAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+ + "This step support text-to-image, image-to-image, inpainting, with or without controlnet/controlnet_union/ip_adapter for Stable Diffusion XL:\n"
+ + "- for image-to-image generation, you need to provide `image_latents`\n"
+ + "- for inpainting, you need to provide `mask_image` and `image_latents`\n"
+ + "- to run the controlnet workflow, you need to provide `control_image`\n"
+ + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
+ + "- to run the ip_adapter workflow, you need to load ip_adapter into your unet and provide `ip_adapter_embeds`\n"
+ + "- for text-to-image generation, all you need to provide is prompt embeddings\n"
+ )
+
+
# ip-adapter, controlnet, text2img, img2img, inpainting
class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLTextEncoderStep,
StableDiffusionXLAutoIPAdapterStep,
StableDiffusionXLAutoVaeEncoderStep,
- StableDiffusionXLAutoBeforeDenoiseStep,
- StableDiffusionXLAutoControlNetInputStep,
- StableDiffusionXLAutoDenoiseStep,
+ StableDiffusionXLCoreDenoiseStep,
StableDiffusionXLAutoDecodeStep,
]
block_names = [
"text_encoder",
"ip_adapter",
- "image_encoder",
- "before_denoise",
- "controlnet_input",
+ "vae_encoder",
"denoise",
- "decoder",
+ "decode",
]
@property
@@ -321,7 +338,7 @@ def description(self):
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
- ("image_encoder", StableDiffusionXLVaeEncoderStep),
+ ("vae_encoder", StableDiffusionXLVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
@@ -334,7 +351,7 @@ def description(self):
INPAINT_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
- ("image_encoder", StableDiffusionXLInpaintVaeEncoderStep),
+ ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
@@ -361,10 +378,8 @@ def description(self):
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
- ("image_encoder", StableDiffusionXLAutoVaeEncoderStep),
- ("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep),
- ("controlnet_input", StableDiffusionXLAutoControlNetInputStep),
- ("denoise", StableDiffusionXLAutoDenoiseStep),
+ ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep),
+ ("denoise", StableDiffusionXLCoreDenoiseStep),
("decode", StableDiffusionXLAutoDecodeStep),
]
)
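
The SDXL presets get the same treatment: the top-level graph shrinks to five entries, and the former middle entries move under the core denoise step. The attributes below come straight from the classes above; only the import path is an assumption.

```python
# Sketch only: import path assumed from the patched file.
from diffusers.modular_pipelines.stable_diffusion_xl.modular_blocks import (
    StableDiffusionXLAutoBlocks,
    StableDiffusionXLCoreDenoiseStep,
)

# Five top-level entries, with the denoise entry nesting four sub-steps:
print(StableDiffusionXLAutoBlocks.block_names)
# ['text_encoder', 'ip_adapter', 'vae_encoder', 'denoise', 'decode']
print(StableDiffusionXLCoreDenoiseStep.block_names)
# ['input', 'before_denoise', 'controlnet_input', 'denoise']
```
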
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py
index e84f5cad1ab4..29a717f72e59 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py
@@ -54,6 +54,8 @@ class StableDiffusionXLModularPipeline(
"""
+ default_blocks_name = "StableDiffusionXLAutoBlocks"
+
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
new file mode 100644
index 000000000000..3e788bf94741
--- /dev/null
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+SDXL_NODE_TYPES_PARAMS_MAP = {
+ "controlnet": {
+ "inputs": [
+ "control_image",
+ "controlnet_conditioning_scale",
+ "control_guidance_start",
+ "control_guidance_end",
+ "height",
+ "width",
+ ],
+ "model_inputs": [
+ "controlnet",
+ ],
+ "outputs": [
+ "controlnet_out",
+ ],
+ "block_names": [None],
+ },
+ "denoise": {
+ "inputs": [
+ "embeddings",
+ "width",
+ "height",
+ "seed",
+ "num_inference_steps",
+ "guidance_scale",
+ "image_latents",
+ "strength",
+ # custom adapters coming in as inputs
+ "controlnet",
+ # ip_adapter is optional and custom; include if available
+ "ip_adapter",
+ ],
+ "model_inputs": [
+ "unet",
+ "guider",
+ "scheduler",
+ ],
+ "outputs": [
+ "latents",
+ "latents_preview",
+ ],
+ "block_names": ["denoise"],
+ },
+ "vae_encoder": {
+ "inputs": [
+ "image",
+ "width",
+ "height",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "image_latents",
+ ],
+ "block_names": ["vae_encoder"],
+ },
+ "text_encoder": {
+ "inputs": [
+ "prompt",
+ "negative_prompt",
+ ],
+ "model_inputs": [
+ "text_encoders",
+ ],
+ "outputs": [
+ "embeddings",
+ ],
+ "block_names": ["text_encoder"],
+ },
+ "decoder": {
+ "inputs": [
+ "latents",
+ ],
+ "model_inputs": [
+ "vae",
+ ],
+ "outputs": [
+ "images",
+ ],
+ "block_names": ["decode"],
+ },
+}
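
Both per-model maps expose the same five node types, which is what lets a generic Mellon exporter iterate over them uniformly; a hedged parity check (imports follow the two new files in this diff):

```python
from diffusers.modular_pipelines.qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP
from diffusers.modular_pipelines.stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP

# Same node-type keys on both sides; only the per-type contents differ
# (e.g. SDXL's denoise node also accepts an "ip_adapter" input).
assert set(SDXL_NODE_TYPES_PARAMS_MAP) == set(QwenImage_NODE_TYPES_PARAMS_MAP)
```
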
diff --git a/src/diffusers/modular_pipelines/wan/before_denoise.py b/src/diffusers/modular_pipelines/wan/before_denoise.py
index 2b9889f8778a..d48f678edd59 100644
--- a/src/diffusers/modular_pipelines/wan/before_denoise.py
+++ b/src/diffusers/modular_pipelines/wan/before_denoise.py
@@ -146,13 +146,13 @@ def intermediate_outputs(self) -> List[str]:
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
+ kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields
description="negative text embeddings used to guide the image generation",
),
]
diff --git a/src/diffusers/modular_pipelines/wan/denoise.py b/src/diffusers/modular_pipelines/wan/denoise.py
index 5f578609c24f..66c51493bd6a 100644
--- a/src/diffusers/modular_pipelines/wan/denoise.py
+++ b/src/diffusers/modular_pipelines/wan/denoise.py
@@ -79,11 +79,11 @@ def intermediate_inputs(self) -> List[str]:
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description=(
"All conditional model inputs that need to be prepared with guider. "
"It should contain prompt_embeds/negative_prompt_embeds. "
- "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+ "Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
),
),
]
diff --git a/src/diffusers/modular_pipelines/wan/encoders.py b/src/diffusers/modular_pipelines/wan/encoders.py
index a0bf76b99b55..cb2fc242383c 100644
--- a/src/diffusers/modular_pipelines/wan/encoders.py
+++ b/src/diffusers/modular_pipelines/wan/encoders.py
@@ -89,13 +89,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
- kwargs_type="guider_input_fields",
+ kwargs_type="denoiser_input_fields",
description="negative text embeddings used to guide the image generation",
),
]
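
Because the rename fans out across the SDXL and Wan folders, a migration check helps catch stragglers; a hedged sketch over a list of param specs (the attribute names are assumed to mirror the constructor arguments shown above):

```python
def find_stale_tags(params):
    # Flag any spec still carrying the old group tag. The .name and
    # .kwargs_type attributes are assumed to be stored as passed in.
    return [p.name for p in params if getattr(p, "kwargs_type", None) == "guider_input_fields"]
```
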
diff --git a/src/diffusers/modular_pipelines/wan/modular_pipeline.py b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
index 4d86e0d08e59..da4aada43839 100644
--- a/src/diffusers/modular_pipelines/wan/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
@@ -37,6 +37,8 @@ class WanModularPipeline(
"""
+ default_blocks_name = "WanAutoBlocks"
+
@property
def default_height(self):
return self.default_sample_height * self.vae_scale_factor_spatial