diff --git a/src/diffusers/modular_pipelines/flux/modular_pipeline.py b/src/diffusers/modular_pipelines/flux/modular_pipeline.py index e97445d411e4..7d869041f2a9 100644 --- a/src/diffusers/modular_pipelines/flux/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/flux/modular_pipeline.py @@ -32,6 +32,8 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion """ + default_blocks_name = "FluxAutoBlocks" + @property def default_height(self): return self.default_sample_size * self.vae_scale_factor diff --git a/src/diffusers/modular_pipelines/mellon_node_utils.py b/src/diffusers/modular_pipelines/mellon_node_utils.py new file mode 100644 index 000000000000..a405aebee221 --- /dev/null +++ b/src/diffusers/modular_pipelines/mellon_node_utils.py @@ -0,0 +1,763 @@ +import json +import logging +import os + +# Simple typed wrapper for parameter overrides +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub.utils import ( + EntryNotFoundError, + HfHubHTTPError, + RepositoryNotFoundError, + RevisionNotFoundError, + validate_hf_hub_args, +) + +from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, PushToHubMixin, extract_commit_hash +from .modular_pipeline import ModularPipelineBlocks + + +logger = logging.getLogger(__name__) + + +SUPPORTED_NODE_TYPES = {"controlnet", "vae_encoder", "denoise", "text_encoder", "decoder"} + + +# Mellon Input Parameters (runtime parameters, not models) +MELLON_INPUT_PARAMS = { + # controlnet + "control_image": { + "label": "Control Image", + "type": "image", + "display": "input", + }, + "controlnet_conditioning_scale": { + "label": "Scale", + "type": "float", + "default": 0.5, + "min": 0, + "max": 1, + }, + "control_guidance_end": { + "label": "End", + "type": "float", + "default": 1.0, + "min": 0, + "max": 1, + }, + "control_guidance_start": { + "label": "Start", + "type": "float", + "default": 0.0, + "min": 0, + "max": 1, + }, + "controlnet": { + "label": "Controlnet", + "type": "custom_controlnet", + "display": "input", + }, + "embeddings": { + "label": "Text Embeddings", + "display": "input", + "type": "embeddings", + }, + "image": { + "label": "Image", + "type": "image", + "display": "input", + }, + "negative_prompt": { + "label": "Negative Prompt", + "type": "string", + "default": "", + "display": "textarea", + }, + "prompt": { + "label": "Prompt", + "type": "string", + "default": "", + "display": "textarea", + }, + "guidance_scale": { + "label": "Guidance Scale", + "type": "float", + "display": "slider", + "default": 5, + "min": 1.0, + "max": 30.0, + "step": 0.1, + }, + "height": { + "label": "Height", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + }, + "image_latents": { + "label": "Image Latents", + "type": "latents", + "display": "input", + "onChange": {False: ["height", "width"], True: ["strength"]}, + }, + "latents": { + "label": "Latents", + "type": "latents", + "display": "input", + }, + "num_inference_steps": { + "label": "Steps", + "type": "int", + "display": "slider", + "default": 25, + "min": 1, + "max": 100, + }, + "seed": { + "label": "Seed", + "type": "int", + "display": "random", + "default": 0, + "min": 0, + "max": 4294967295, + }, + "strength": { + "label": "Strength", + "type": "float", + "default": 0.5, + "min": 0.0, + "max": 1.0, + "step": 0.01, + }, + "width": { + "label": "Width", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + }, + "ip_adapter": { + 
"label": "IP Adapter", + "type": "custom_ip_adapter", + "display": "input", + }, +} + +# Mellon Model Parameters (diffusers_auto_model types) +MELLON_MODEL_PARAMS = { + "scheduler": { + "label": "Scheduler", + "display": "input", + "type": "diffusers_auto_model", + }, + "text_encoders": { + "label": "Text Encoders", + "type": "diffusers_auto_models", + "display": "input", + }, + "unet": { + "label": "Unet", + "display": "input", + "type": "diffusers_auto_model", + "onSignal": { + "action": "signal", + "target": "guider", + }, + }, + "guider": { + "label": "Guider", + "display": "input", + "type": "custom_guider", + "onChange": {False: ["guidance_scale"], True: []}, + }, + "vae": { + "label": "VAE", + "display": "input", + "type": "diffusers_auto_model", + }, + "controlnet": { + "label": "Controlnet Model", + "type": "diffusers_auto_model", + "display": "input", + }, +} + +# Mellon Output Parameters (display = "output") +MELLON_OUTPUT_PARAMS = { + "embeddings": { + "label": "Text Embeddings", + "display": "output", + "type": "embeddings", + }, + "images": { + "label": "Images", + "type": "image", + "display": "output", + }, + "image_latents": { + "label": "Image Latents", + "type": "latents", + "display": "output", + }, + "latents": { + "label": "Latents", + "type": "latents", + "display": "output", + }, + "latents_preview": { + "label": "Latents Preview", + "display": "output", + "type": "latent", + }, + "controlnet_out": { + "label": "Controlnet", + "display": "output", + "type": "controlnet", + }, +} + + +# Default param selections per supported node_type +# from MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. +NODE_TYPE_PARAMS_MAP = { + "controlnet": { + "inputs": [ + "control_image", + "controlnet_conditioning_scale", + "control_guidance_start", + "control_guidance_end", + "height", + "width", + ], + "model_inputs": [ + "controlnet", + "vae", + ], + "outputs": [ + "controlnet", + ], + "block_names": ["controlnet_vae_encoder"], + }, + "denoise": { + "inputs": [ + "embeddings", + "width", + "height", + "seed", + "num_inference_steps", + "guidance_scale", + "image_latents", + "strength", + # custom adapters coming in as inputs + "controlnet", + # ip_adapter is optional and custom; include if available + "ip_adapter", + ], + "model_inputs": [ + "unet", + "guider", + "scheduler", + ], + "outputs": [ + "latents", + "latents_preview", + ], + "block_names": ["denoise"], + }, + "vae_encoder": { + "inputs": [ + "image", + "width", + "height", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "image_latents", + ], + "block_names": ["vae_encoder"], + }, + "text_encoder": { + "inputs": [ + "prompt", + "negative_prompt", + # optional image prompt input supported in embeddings node + "image", + ], + "model_inputs": [ + "text_encoders", + ], + "outputs": [ + "embeddings", + ], + "block_names": ["text_encoder"], + }, + "decoder": { + "inputs": [ + "latents", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "images", + ], + "block_names": ["decode"], + }, +} + + +@dataclass(frozen=True) +class MellonParam: + name: str + label: str + type: str + display: Optional[str] = None + default: Any = None + min: Optional[float] = None + max: Optional[float] = None + step: Optional[float] = None + options: Any = None + value: Any = None + fieldOptions: Optional[Dict[str, Any]] = None + onChange: Any = None + onSignal: Any = None + _map_to_input: Any = None # the block input name this parameter maps to + + def to_dict(self) -> Dict[str, Any]: + data = asdict(self) + return {k: v for 
+
+
+@dataclass
+class MellonNodeConfig(PushToHubMixin):
+    """
+    A MellonNodeConfig is a base class for building Mellon node UIs with Modular Diffusers.
+
+    <Tip warning={true}>
+
+    This is an experimental feature and is likely to change in the future.
+
+    </Tip>
+    """
+
+    inputs: List[Union[str, MellonParam]]
+    model_inputs: List[Union[str, MellonParam]]
+    outputs: List[Union[str, MellonParam]]
+    blocks_names: List[str]
+    node_type: str
+    config_name = "mellon_config.json"
+
+    def __post_init__(self):
+        if isinstance(self.inputs, list):
+            self.inputs = self._resolve_params_list(self.inputs, MELLON_INPUT_PARAMS)
+        if isinstance(self.model_inputs, list):
+            self.model_inputs = self._resolve_params_list(self.model_inputs, MELLON_MODEL_PARAMS)
+        if isinstance(self.outputs, list):
+            self.outputs = self._resolve_params_list(self.outputs, MELLON_OUTPUT_PARAMS)
+
+    @staticmethod
+    def _resolve_params_list(
+        params: List[Union[str, MellonParam]], default_map: Dict[str, Dict[str, Any]]
+    ) -> Dict[str, Dict[str, Any]]:
+        def _resolve_param(
+            param: Union[str, MellonParam], default_params_map: Dict[str, Dict[str, Any]]
+        ) -> Tuple[str, Dict[str, Any]]:
+            if isinstance(param, str):
+                if param not in default_params_map:
+                    raise ValueError(f"Unknown param '{param}', please define a `MellonParam` object instead")
+                return param, default_params_map[param].copy()
+            elif isinstance(param, MellonParam):
+                param_dict = param.to_dict()
+                param_name = param_dict.pop("name")
+                return param_name, param_dict
+            else:
+                raise ValueError(
+                    f"Unknown param type '{type(param)}', please use a string or a `MellonParam` object instead"
+                )
+
+        resolved = {}
+        for p in params:
+            logger.debug(f"Resolving param: {p}")
+            name, cfg = _resolve_param(p, default_map)
+            if name in resolved:
+                raise ValueError(f"Duplicate param '{name}'")
+            resolved[name] = cfg
+        return resolved
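+
+    # Example (illustrative): preset names and explicit `MellonParam`s can be
+    # mixed; strings must exist in the default map, anything else needs a
+    # `MellonParam`:
+    #
+    #   MellonNodeConfig._resolve_params_list(
+    #       ["prompt", MellonParam(name="cfg", label="CFG", type="float", default=5.0)],
+    #       MELLON_INPUT_PARAMS,
+    #   )
+    #   # -> {"prompt": {...preset...}, "cfg": {"label": "CFG", "type": "float", "default": 5.0}}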
+    @classmethod
+    @validate_hf_hub_args
+    def load_mellon_config(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        return_unused_kwargs=False,
+        return_commit_hash=False,
+        **kwargs,
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        r"""
+        Load a Mellon node configuration.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing a configuration saved
+                      with [`~MellonNodeConfig.save_mellon_config`].
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether unused keyword arguments of the config are returned.
+            return_commit_hash (`bool`, *optional*, defaults to `False`):
+                Whether the `commit_hash` of the loaded configuration is returned.
+
+        Returns:
+            `dict`:
+                A dictionary of all the parameters stored in a JSON configuration file.
+
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        local_dir = kwargs.pop("local_dir", None)
+        local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto")
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+        if cls.config_name is None:
+            raise ValueError(
+                "`self.config_name` is not defined. Note that one should not load a config from "
+                "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
+            )
+        if os.path.isfile(pretrained_model_name_or_path):
+            config_file = pretrained_model_name_or_path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
+                # Load from a local directory
+                config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
+            else:
+                raise EnvironmentError(
+                    f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
+                )
+        else:
+            try:
+                # Load from URL or cache if already cached
+                config_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=cls.config_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    local_dir=local_dir,
+                    local_dir_use_symlinks=local_dir_use_symlinks,
+                )
+            except RepositoryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
+                    " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
+                    " token having permission to this repo with `token` or log in with `hf auth login`."
+                )
+            except RevisionNotFoundError:
+                raise EnvironmentError(
+                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
+                    " this model name. Check the model page at"
+                    f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                )
+            except EntryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
+                )
+            except HfHubHTTPError as err:
+                raise EnvironmentError(
+                    "There was a specific connection error when trying to load"
+                    f" {pretrained_model_name_or_path}:\n{err}"
+                )
+            except ValueError:
+                raise EnvironmentError(
+                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                    f" directory containing a {cls.config_name} file.\nCheck your internet connection or see how to"
+                    " run the library in offline mode at"
+                    " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+                )
+            except EnvironmentError:
+                raise EnvironmentError(
+                    f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                    f"containing a {cls.config_name} file"
+                )
+        try:
+            with open(config_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            config_dict = json.loads(text)
+
+            commit_hash = extract_commit_hash(config_file)
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
+
+        if not (return_unused_kwargs or return_commit_hash):
+            return config_dict
+
+        outputs = (config_dict,)
+
+        if return_unused_kwargs:
+            outputs += (kwargs,)
+
+        if return_commit_hash:
+            outputs += (commit_hash,)
+
+        return outputs
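+
+    # Example (illustrative; the repo id is a placeholder for a repo that
+    # contains a `mellon_config.json`):
+    #
+    #   config_dict = MellonNodeConfig.load_mellon_config("some-user/some-mellon-node")
+    #   node_config = MellonNodeConfig.from_mellon_dict(config_dict)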
+    def save_mellon_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save the Mellon node definition to a JSON file.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file is saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your config to the Hugging Face Hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        """
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        # If we save using the predefined name, we can load using `load_mellon_config`
+        output_config_file = os.path.join(save_directory, self.config_name)
+
+        self.to_json_file(output_config_file)
+        logger.info(f"Mellon node definition saved in {output_config_file}")
+
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            private = kwargs.pop("private", None)
+            create_pr = kwargs.pop("create_pr", False)
+            token = kwargs.pop("token", None)
+            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
+            subfolder = kwargs.pop("subfolder", None)
+
+            self._upload_folder(
+                save_directory,
+                repo_id,
+                token=token,
+                commit_message=commit_message,
+                create_pr=create_pr,
+                subfolder=subfolder,
+            )
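+
+    # Example (illustrative; the local path and repo id are placeholders):
+    #
+    #   node_config.save_mellon_config("./my_mellon_node")
+    #   node_config.save_mellon_config("./my_mellon_node", push_to_hub=True, repo_id="some-user/my-mellon-node")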
+    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+        """
+        Save the Mellon schema dictionary to a JSON file.
+
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file to save a configuration instance's parameters.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            writer.write(self.to_json_string())
+
+    def to_json_string(self) -> str:
+        """
+        Serializes this instance to a JSON string of the Mellon schema dict.
+
+        Returns:
+            `str`: String containing all the attributes that make up this configuration instance in JSON format.
+        """
+        mellon_dict = self.to_mellon_dict()
+        return json.dumps(mellon_dict, indent=2, sort_keys=True) + "\n"
+
+    def to_mellon_dict(self) -> Dict[str, Any]:
+        """Return a JSON-serializable dict focusing on the Mellon schema fields only.
+
+        `params` is a single flat dict composed as `{**inputs, **model_inputs, **outputs}`.
+        """
+        # inputs/model_inputs/outputs are already normalized dicts
+        merged_params = {}
+        merged_params.update(self.inputs or {})
+        merged_params.update(self.model_inputs or {})
+        merged_params.update(self.outputs or {})
+
+        return {
+            "node_type": self.node_type,
+            "blocks_names": self.blocks_names,
+            "params": merged_params,
+        }
+
+    @classmethod
+    def from_mellon_dict(cls, mellon_dict: Dict[str, Any]) -> "MellonNodeConfig":
+        """Create a config from a Mellon schema dict produced by `to_mellon_dict()`.
+
+        Splits the flat params dict back into inputs/model_inputs/outputs based on each param's `display` and
+        `type` fields. Unknown keys are treated as inputs by default.
+        """
+        flat_params = mellon_dict.get("params", {})
+
+        inputs: Dict[str, Any] = {}
+        model_inputs: Dict[str, Any] = {}
+        outputs: Dict[str, Any] = {}
+
+        for param_name, param_dict in flat_params.items():
+            if param_dict.get("display", "") == "output":
+                outputs[param_name] = param_dict
+            elif param_dict.get("type", "") in ("diffusers_auto_model", "diffusers_auto_models"):
+                model_inputs[param_name] = param_dict
+            else:
+                inputs[param_name] = param_dict
+
+        return cls(
+            inputs=inputs,
+            model_inputs=model_inputs,
+            outputs=outputs,
+            blocks_names=mellon_dict.get("blocks_names", []),
+            node_type=mellon_dict.get("node_type"),
+        )
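+
+    # Example (illustrative) of the schema round-trip:
+    #
+    #   mellon_dict = node_config.to_mellon_dict()
+    #   # -> {"node_type": "denoise", "blocks_names": ["denoise"], "params": {...}}
+    #   restored = MellonNodeConfig.from_mellon_dict(mellon_dict)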
+ """ + if node_type not in NODE_TYPE_PARAMS_MAP: + raise ValueError(f"Node type {node_type} not supported") + + blocks_names = list(blocks.sub_blocks.keys()) + + default_node_config = NODE_TYPE_PARAMS_MAP[node_type] + inputs_list: List[Union[str, MellonParam]] = default_node_config.get("inputs", []) + model_inputs_list: List[Union[str, MellonParam]] = default_node_config.get("model_inputs", []) + outputs_list: List[Union[str, MellonParam]] = default_node_config.get("outputs", []) + + for required_input_name in blocks.required_inputs: + if required_input_name not in inputs_list: + inputs_list.append( + MellonParam( + name=required_input_name, label=required_input_name, type=required_input_name, display="input" + ) + ) + + for component_spec in blocks.expected_components: + if component_spec.name not in model_inputs_list: + model_inputs_list.append( + MellonParam( + name=component_spec.name, + label=component_spec.name, + type="diffusers_auto_model", + display="input", + ) + ) + + return cls( + inputs=inputs_list, + model_inputs=model_inputs_list, + outputs=outputs_list, + blocks_names=blocks_names, + node_type=node_type, + ) + + +# Minimal modular registry for Mellon node configs +class ModularMellonNodeRegistry: + """Registry mapping (pipeline class, blocks_name) -> list of MellonNodeConfig.""" + + def __init__(self): + self._registry = {} + self._initialized = False + + def register(self, pipeline_cls: type, node_params: Dict[str, MellonNodeConfig]): + if not self._initialized: + _initialize_registry(self) + self._registry[pipeline_cls] = node_params + + def get(self, pipeline_cls: type) -> MellonNodeConfig: + if not self._initialized: + _initialize_registry(self) + return self._registry.get(pipeline_cls, None) + + def get_all(self) -> Dict[type, Dict[str, MellonNodeConfig]]: + if not self._initialized: + _initialize_registry(self) + return self._registry + + +def _register_preset_node_types( + pipeline_cls, params_map: Dict[str, Dict[str, Any]], registry: ModularMellonNodeRegistry +): + """Register all node-type presets for a given pipeline class from a params map.""" + node_configs = {} + for node_type, spec in params_map.items(): + node_config = MellonNodeConfig( + inputs=spec.get("inputs", []), + model_inputs=spec.get("model_inputs", []), + outputs=spec.get("outputs", []), + blocks_names=spec.get("block_names", []), + node_type=node_type, + ) + node_configs[node_type] = node_config + registry.register(pipeline_cls, node_configs) + + +def _initialize_registry(registry: ModularMellonNodeRegistry): + """Initialize the registry and register all available pipeline configs.""" + print("Initializing registry") + + registry._initialized = True + + try: + from .qwenimage.modular_pipeline import QwenImageModularPipeline + from .qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP + + _register_preset_node_types(QwenImageModularPipeline, QwenImage_NODE_TYPES_PARAMS_MAP, registry) + except Exception: + raise Exception("Failed to register QwenImageModularPipeline") + + try: + from .stable_diffusion_xl.modular_pipeline import StableDiffusionXLModularPipeline + from .stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP + + _register_preset_node_types(StableDiffusionXLModularPipeline, SDXL_NODE_TYPES_PARAMS_MAP, registry) + except Exception: + raise Exception("Failed to register StableDiffusionXLModularPipeline") diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 74ffc6234894..206d19f17371 100644 --- 
a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -51,6 +51,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# map regular pipeline to modular pipeline class name MODULAR_PIPELINE_MAPPING = OrderedDict( [ ("stable-diffusion-xl", "StableDiffusionXLModularPipeline"), @@ -61,16 +62,6 @@ ] ) -MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict( - [ - ("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"), - ("WanModularPipeline", "WanAutoBlocks"), - ("FluxModularPipeline", "FluxAutoBlocks"), - ("QwenImageModularPipeline", "QwenImageAutoBlocks"), - ("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"), - ] -) - @dataclass class PipelineState: @@ -423,7 +414,7 @@ def set_block_state(self, state: PipelineState, block_state: BlockState): state.set(input_param.name, param, input_param.kwargs_type) elif input_param.kwargs_type: - # if it is a kwargs type, e.g. "guider_input_fields", it is likely to be a list of parameters + # if it is a kwargs type, e.g. "denoiser_input_fields", it is likely to be a list of parameters # we need to first find out which inputs are and loop through them. intermediate_kwargs = state.get_by_kwargs(input_param.kwargs_type) for param_name, current_value in intermediate_kwargs.items(): @@ -1454,6 +1445,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): config_name = "modular_model_index.json" hf_device_map = None + default_blocks_name = None # YiYi TODO: add warning for passing multiple ComponentSpec/ConfigSpec with the same name def __init__( @@ -1514,7 +1506,7 @@ def __init__( `_blocks_class_name` in the config dict """ if blocks is None: - blocks_class_name = MODULAR_PIPELINE_BLOCKS_MAPPING.get(self.__class__.__name__) + blocks_class_name = self.default_blocks_name if blocks_class_name is not None: diffusers_module = importlib.import_module("diffusers") blocks_class = getattr(diffusers_module, blocks_class_name) diff --git a/src/diffusers/modular_pipelines/node_utils.py b/src/diffusers/modular_pipelines/node_utils.py deleted file mode 100644 index 5db860c7887d..000000000000 --- a/src/diffusers/modular_pipelines/node_utils.py +++ /dev/null @@ -1,665 +0,0 @@ -import json -import logging -import os -from pathlib import Path -from typing import List, Optional, Tuple, Union - -import numpy as np -import PIL -import torch - -from ..configuration_utils import ConfigMixin -from ..image_processor import PipelineImageInput -from .modular_pipeline import ModularPipelineBlocks, SequentialPipelineBlocks -from .modular_pipeline_utils import InputParam - - -logger = logging.getLogger(__name__) - -# YiYi Notes: this is actually for SDXL, put it here for now -SDXL_INPUTS_SCHEMA = { - "prompt": InputParam( - "prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation" - ), - "prompt_2": InputParam( - "prompt_2", - type_hint=Union[str, List[str]], - description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2", - ), - "negative_prompt": InputParam( - "negative_prompt", - type_hint=Union[str, List[str]], - description="The prompt or prompts not to guide the image generation", - ), - "negative_prompt_2": InputParam( - "negative_prompt_2", - type_hint=Union[str, List[str]], - description="The negative prompt or prompts for text_encoder_2", - ), - "cross_attention_kwargs": InputParam( - "cross_attention_kwargs", - type_hint=Optional[dict], - description="Kwargs dictionary passed to the AttentionProcessor", - ), - 
"clip_skip": InputParam( - "clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder" - ), - "image": InputParam( - "image", - type_hint=PipelineImageInput, - required=True, - description="The image(s) to modify for img2img or inpainting", - ), - "mask_image": InputParam( - "mask_image", - type_hint=PipelineImageInput, - required=True, - description="Mask image for inpainting, white pixels will be repainted", - ), - "generator": InputParam( - "generator", - type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], - description="Generator(s) for deterministic generation", - ), - "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"), - "width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"), - "num_images_per_prompt": InputParam( - "num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt" - ), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, default=50, description="Number of denoising steps" - ), - "timesteps": InputParam( - "timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process" - ), - "sigmas": InputParam( - "sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process" - ), - "denoising_end": InputParam( - "denoising_end", - type_hint=Optional[float], - description="Fraction of denoising process to complete before termination", - ), - # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999 - "strength": InputParam( - "strength", type_hint=float, default=0.3, description="How much to transform the reference image" - ), - "denoising_start": InputParam( - "denoising_start", type_hint=Optional[float], description="Starting point of the denoising process" - ), - "latents": InputParam( - "latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation" - ), - "padding_mask_crop": InputParam( - "padding_mask_crop", - type_hint=Optional[Tuple[int, int]], - description="Size of margin in crop for image and mask", - ), - "original_size": InputParam( - "original_size", - type_hint=Optional[Tuple[int, int]], - description="Original size of the image for SDXL's micro-conditioning", - ), - "target_size": InputParam( - "target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning" - ), - "negative_original_size": InputParam( - "negative_original_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on image resolution", - ), - "negative_target_size": InputParam( - "negative_target_size", - type_hint=Optional[Tuple[int, int]], - description="Negative conditioning based on target resolution", - ), - "crops_coords_top_left": InputParam( - "crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Top-left coordinates for SDXL's micro-conditioning", - ), - "negative_crops_coords_top_left": InputParam( - "negative_crops_coords_top_left", - type_hint=Tuple[int, int], - default=(0, 0), - description="Negative conditioning crop coordinates", - ), - "aesthetic_score": InputParam( - "aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image" - ), - "negative_aesthetic_score": InputParam( - "negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score" 
- ), - "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"), - "output_type": InputParam( - "output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)" - ), - "ip_adapter_image": InputParam( - "ip_adapter_image", - type_hint=PipelineImageInput, - required=True, - description="Image(s) to be used as IP adapter", - ), - "control_image": InputParam( - "control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition" - ), - "control_guidance_start": InputParam( - "control_guidance_start", - type_hint=Union[float, List[float]], - default=0.0, - description="When ControlNet starts applying", - ), - "control_guidance_end": InputParam( - "control_guidance_end", - type_hint=Union[float, List[float]], - default=1.0, - description="When ControlNet stops applying", - ), - "controlnet_conditioning_scale": InputParam( - "controlnet_conditioning_scale", - type_hint=Union[float, List[float]], - default=1.0, - description="Scale factor for ControlNet outputs", - ), - "guess_mode": InputParam( - "guess_mode", - type_hint=bool, - default=False, - description="Enables ControlNet encoder to recognize input without prompts", - ), - "control_mode": InputParam( - "control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet" - ), -} - -SDXL_INTERMEDIATE_INPUTS_SCHEMA = { - "prompt_embeds": InputParam( - "prompt_embeds", - type_hint=torch.Tensor, - required=True, - description="Text embeddings used to guide image generation", - ), - "negative_prompt_embeds": InputParam( - "negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings" - ), - "pooled_prompt_embeds": InputParam( - "pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings" - ), - "negative_pooled_prompt_embeds": InputParam( - "negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings" - ), - "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"), - "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"), - "preprocess_kwargs": InputParam( - "preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor" - ), - "latents": InputParam( - "latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process" - ), - "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"), - "num_inference_steps": InputParam( - "num_inference_steps", type_hint=int, required=True, description="Number of denoising steps" - ), - "latent_timestep": InputParam( - "latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep" - ), - "image_latents": InputParam( - "image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image" - ), - "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"), - "masked_image_latents": InputParam( - "masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting" - ), - "add_time_ids": InputParam( - "add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning" - ), - "negative_add_time_ids": InputParam( - "negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids" - ), - 
"timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"), - "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"), - "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"), - "ip_adapter_embeds": InputParam( - "ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter" - ), - "negative_ip_adapter_embeds": InputParam( - "negative_ip_adapter_embeds", - type_hint=List[torch.Tensor], - description="Negative image embeddings for IP-Adapter", - ), - "images": InputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - required=True, - description="Generated images", - ), -} - -SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA} - - -DEFAULT_PARAM_MAPS = { - "prompt": { - "label": "Prompt", - "type": "string", - "default": "a bear sitting in a chair drinking a milkshake", - "display": "textarea", - }, - "negative_prompt": { - "label": "Negative Prompt", - "type": "string", - "default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality", - "display": "textarea", - }, - "num_inference_steps": { - "label": "Steps", - "type": "int", - "default": 25, - "min": 1, - "max": 1000, - }, - "seed": { - "label": "Seed", - "type": "int", - "default": 0, - "min": 0, - "display": "random", - }, - "width": { - "label": "Width", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": "dimensions", - }, - "height": { - "label": "Height", - "type": "int", - "display": "text", - "default": 1024, - "min": 8, - "max": 8192, - "step": 8, - "group": "dimensions", - }, - "images": { - "label": "Images", - "type": "image", - "display": "output", - }, - "image": { - "label": "Image", - "type": "image", - "display": "input", - }, -} - -DEFAULT_TYPE_MAPS = { - "int": { - "type": "int", - "default": 0, - "min": 0, - }, - "float": { - "type": "float", - "default": 0.0, - "min": 0.0, - }, - "str": { - "type": "string", - "default": "", - }, - "bool": { - "type": "boolean", - "default": False, - }, - "image": { - "type": "image", - }, -} - -DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"] -DEFAULT_CATEGORY = "Modular Diffusers" -DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"] -DEFAULT_PARAMS_GROUPS_KEYS = { - "text_encoders": ["text_encoder", "tokenizer"], - "ip_adapter_embeds": ["ip_adapter_embeds"], - "prompt_embeddings": ["prompt_embeds"], -} - - -def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS): - """ - Get the group name for a given parameter name, if not part of a group, return None e.g. "prompt_embeds" -> - "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None - """ - if name is None: - return None - for group_name, group_keys in group_params_keys.items(): - for group_key in group_keys: - if group_key in name: - return group_name - return None - - -class ModularNode(ConfigMixin): - """ - A ModularNode is a base class to build UI nodes using diffusers. Currently only supports Mellon. It is a wrapper - around a ModularPipelineBlocks object. - - - - This is an experimental feature and is likely to change in the future. 
- - - """ - - config_name = "node_config.json" - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - trust_remote_code: Optional[bool] = None, - **kwargs, - ): - blocks = ModularPipelineBlocks.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs - ) - return cls(blocks, **kwargs) - - def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs): - self.blocks = blocks - - if label is None: - label = self.blocks.__class__.__name__ - # blocks param name -> mellon param name - self.name_mapping = {} - - input_params = {} - # pass or create a default param dict for each input - # e.g. for prompt, - # prompt = { - # "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers - # "label": "Prompt", - # "type": "string", - # "default": "a bear sitting in a chair drinking a milkshake", - # "display": "textarea"} - # if type is not specified, it'll be a "custom" param of its own type - # e.g. you can pass ModularNode(scheduler = {name :"scheduler"}) - # it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}} - # name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}} - inputs = self.blocks.inputs + self.blocks.intermediate_inputs - for inp in inputs: - param = kwargs.pop(inp.name, None) - if param: - # user can pass a param dict for all inputs, e.g. ModularNode(prompt = {...}) - input_params[inp.name] = param - mellon_name = param.pop("name", inp.name) - if mellon_name != inp.name: - self.name_mapping[inp.name] = mellon_name - continue - - if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name): - continue - - if inp.name in DEFAULT_PARAM_MAPS: - # first check if it's in the default param map, if so, directly use that - param = DEFAULT_PARAM_MAPS[inp.name].copy() - elif get_group_name(inp.name): - param = get_group_name(inp.name) - if inp.name not in self.name_mapping: - self.name_mapping[inp.name] = param - else: - # if not, check if it's in the SDXL input schema, if so, - # 1. use the type hint to determine the type - # 2. use the default param dict for the type e.g. 
if "steps" is a "int" type, {"steps": {"type": "int", "default": 0, "min": 0}} - if inp.type_hint is not None: - type_str = str(inp.type_hint).lower() - else: - inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None) - type_str = str(inp_spec.type_hint).lower() if inp_spec else "" - for type_key, type_param in DEFAULT_TYPE_MAPS.items(): - if type_key in type_str: - param = type_param.copy() - param["label"] = inp.name - param["display"] = "input" - break - else: - param = inp.name - # add the param dict to the inp_params dict - input_params[inp.name] = param - - component_params = {} - for comp in self.blocks.expected_components: - param = kwargs.pop(comp.name, None) - if param: - component_params[comp.name] = param - mellon_name = param.pop("name", comp.name) - if mellon_name != comp.name: - self.name_mapping[comp.name] = mellon_name - continue - - to_exclude = False - for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS: - if exclude_key in comp.name: - to_exclude = True - break - if to_exclude: - continue - - if get_group_name(comp.name): - param = get_group_name(comp.name) - if comp.name not in self.name_mapping: - self.name_mapping[comp.name] = param - elif comp.name in DEFAULT_MODEL_KEYS: - param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"} - else: - param = comp.name - # add the param dict to the model_params dict - component_params[comp.name] = param - - output_params = {} - if isinstance(self.blocks, SequentialPipelineBlocks): - last_block_name = list(self.blocks.sub_blocks.keys())[-1] - outputs = self.blocks.sub_blocks[last_block_name].intermediate_outputs - else: - outputs = self.blocks.intermediate_outputs - - for out in outputs: - param = kwargs.pop(out.name, None) - if param: - output_params[out.name] = param - mellon_name = param.pop("name", out.name) - if mellon_name != out.name: - self.name_mapping[out.name] = mellon_name - continue - - if out.name in DEFAULT_PARAM_MAPS: - param = DEFAULT_PARAM_MAPS[out.name].copy() - param["display"] = "output" - else: - group_name = get_group_name(out.name) - if group_name: - param = group_name - if out.name not in self.name_mapping: - self.name_mapping[out.name] = param - else: - param = out.name - # add the param dict to the outputs dict - output_params[out.name] = param - - if len(kwargs) > 0: - logger.warning(f"Unused kwargs: {kwargs}") - - register_dict = { - "category": category, - "label": label, - "input_params": input_params, - "component_params": component_params, - "output_params": output_params, - "name_mapping": self.name_mapping, - } - self.register_to_config(**register_dict) - - def setup(self, components_manager, collection=None): - self.pipeline = self.blocks.init_pipeline(components_manager=components_manager, collection=collection) - self._components_manager = components_manager - - @property - def mellon_config(self): - return self._convert_to_mellon_config() - - def _convert_to_mellon_config(self): - node = {} - node["label"] = self.config.label - node["category"] = self.config.category - - node_param = {} - for inp_name, inp_param in self.config.input_params.items(): - if inp_name in self.name_mapping: - mellon_name = self.name_mapping[inp_name] - else: - mellon_name = inp_name - if isinstance(inp_param, str): - param = { - "label": inp_param, - "type": inp_param, - "display": "input", - } - else: - param = inp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}") - - for 
comp_name, comp_param in self.config.component_params.items(): - if comp_name in self.name_mapping: - mellon_name = self.name_mapping[comp_name] - else: - mellon_name = comp_name - if isinstance(comp_param, str): - param = { - "label": comp_param, - "type": comp_param, - "display": "input", - } - else: - param = comp_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}") - - for out_name, out_param in self.config.output_params.items(): - if out_name in self.name_mapping: - mellon_name = self.name_mapping[out_name] - else: - mellon_name = out_name - if isinstance(out_param, str): - param = { - "label": out_param, - "type": out_param, - "display": "output", - } - else: - param = out_param - - if mellon_name not in node_param: - node_param[mellon_name] = param - else: - logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}") - node["params"] = node_param - return node - - def save_mellon_config(self, file_path): - """ - Save the Mellon configuration to a JSON file. - - Args: - file_path (str or Path): Path where the JSON file will be saved - - Returns: - Path: Path to the saved config file - """ - file_path = Path(file_path) - - # Create directory if it doesn't exist - os.makedirs(file_path.parent, exist_ok=True) - - # Create a combined dictionary with module definition and name mapping - config = {"module": self.mellon_config, "name_mapping": self.name_mapping} - - # Save the config to file - with open(file_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=2) - - logger.info(f"Mellon config and name mapping saved to {file_path}") - - return file_path - - @classmethod - def load_mellon_config(cls, file_path): - """ - Load a Mellon configuration from a JSON file. 
- - Args: - file_path (str or Path): Path to the JSON file containing Mellon config - - Returns: - dict: The loaded combined configuration containing 'module' and 'name_mapping' - """ - file_path = Path(file_path) - - if not file_path.exists(): - raise FileNotFoundError(f"Config file not found: {file_path}") - - with open(file_path, "r", encoding="utf-8") as f: - config = json.load(f) - - logger.info(f"Mellon config loaded from {file_path}") - - return config - - def process_inputs(self, **kwargs): - params_components = {} - for comp_name, comp_param in self.config.component_params.items(): - logger.debug(f"component: {comp_name}") - mellon_comp_name = self.name_mapping.get(comp_name, comp_name) - if mellon_comp_name in kwargs: - if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]: - comp = kwargs[mellon_comp_name].pop(comp_name) - else: - comp = kwargs.pop(mellon_comp_name) - if comp: - params_components[comp_name] = self._components_manager.get_one(comp["model_id"]) - - params_run = {} - for inp_name, inp_param in self.config.input_params.items(): - logger.debug(f"input: {inp_name}") - mellon_inp_name = self.name_mapping.get(inp_name, inp_name) - if mellon_inp_name in kwargs: - if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]: - inp = kwargs[mellon_inp_name].pop(inp_name) - else: - inp = kwargs.pop(mellon_inp_name) - if inp is not None: - params_run[inp_name] = inp - - return_output_names = list(self.config.output_params.keys()) - - return params_components, params_run, return_output_names - - def execute(self, **kwargs): - params_components, params_run, return_output_names = self.process_inputs(**kwargs) - - self.pipeline.update_components(**params_components) - output = self.pipeline(**params_run, output=return_output_names) - return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 738a1e5d151d..606236cfe91b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -577,9 +577,8 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam( - name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input" - ), + InputParam(name="image_height", required=True), + InputParam(name="image_width", required=True), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), @@ -612,10 +611,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state = self.get_block_state(state) # for edit, image size can be different from the target size (height/width) - image = ( - block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image - ) - image_width, image_height = image.size block_state.img_shapes = [ [ @@ -624,7 +619,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2, ), - (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2), + ( + 1, + block_state.image_height // components.vae_scale_factor // 2, + block_state.image_width // components.vae_scale_factor // 2, + ), ] ] * block_state.batch_size diff 
--git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py
index 280fa6a152c4..2ab83a03ee55 100644
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -496,7 +496,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         )
 
         if components.requires_unconditional_embeds:
-            negative_prompt = block_state.negative_prompt or ""
+            negative_prompt = block_state.negative_prompt or " "
             block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
                 components.text_encoder,
                 components.processor,
diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py
index 2b787c823865..2b229c040b89 100644
--- a/src/diffusers/modular_pipelines/qwenimage/inputs.py
+++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py
@@ -307,6 +307,13 @@ def inputs(self) -> List[InputParam]:
 
         return inputs
 
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
+            OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
+        ]
+
     @property
     def expected_components(self) -> List[ComponentSpec]:
         return [
@@ -327,6 +334,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         block_state.height = block_state.height or height
         block_state.width = block_state.width or width
 
+        if not hasattr(block_state, "image_height"):
+            block_state.image_height = height
+        if not hasattr(block_state, "image_width"):
+            block_state.image_width = width
+
         # 2. Patchify the image latent tensor
         image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
 
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
index a01c742fcf68..9126766cc202 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
@@ -511,17 +511,42 @@ def description(self):
         )
 
 
+class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "qwenimage"
+    block_classes = [
+        QwenImageAutoInputStep,
+        QwenImageOptionalControlNetInputStep,
+        QwenImageAutoBeforeDenoiseStep,
+        QwenImageOptionalControlNetBeforeDenoiseStep,
+        QwenImageAutoDenoiseStep,
+    ]
+    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process.\n"
+            + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+            + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
+            + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+            + "This step supports text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+            + " - for image-to-image generation, you need to provide `image_latents`\n"
+            + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+            + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+            + " - for text-to-image generation, all you need to provide is prompt embeddings"
+        )
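+
+
+# Example (illustrative): the consolidated step still exposes its sub-blocks,
+# e.g. to inspect or replace one of them:
+#
+#   core = QwenImageCoreDenoiseStep()
+#   core.sub_blocks["before_denoise"]  # -> QwenImageAutoBeforeDenoiseStep instance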
\n" + + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" + + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n" + + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" + + " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n" + + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n" + + " - to run the controlnet workflow, you need to provide `control_image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings" + ) + + ## 1.10 QwenImage/auto block & presets AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageTextEncoderStep()), ("vae_encoder", QwenImageAutoVaeEncoderStep()), ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), - ("input", QwenImageAutoInputStep()), - ("controlnet_input", QwenImageOptionalControlNetInputStep()), - ("before_denoise", QwenImageAutoBeforeDenoiseStep()), - ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()), - ("denoise", QwenImageAutoDenoiseStep()), + ("denoise", QwenImageCoreDenoiseStep()), ("decode", QwenImageAutoDecodeStep()), ] ) @@ -699,7 +724,7 @@ def description(self): class QwenImageEditAutoInputStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] block_names = ["edit_inpaint", "edit"] - block_trigger_inputs = ["processed_mask_image", "image"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] @property def description(self): @@ -800,13 +825,34 @@ def description(self): ## 2.7 QwenImage-Edit/auto blocks & presets + +class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditAutoInputStep, + QwenImageEditAutoBeforeDenoiseStep, + QwenImageEditAutoDenoiseStep, + ] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support edit (img2img) and edit inpainting workflow for QwenImage Edit:\n" + + " - When `processed_mask_image` is provided, it will be used for edit inpainting task.\n" + + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" + ) + + EDIT_AUTO_BLOCKS = InsertableDict( [ ("text_encoder", QwenImageEditVLEncoderStep()), ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), - ("input", QwenImageEditAutoInputStep()), - ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()), - ("denoise", QwenImageEditAutoDenoiseStep()), + ("denoise", QwenImageEditCoreDenoiseStep()), ("decode", QwenImageAutoDecodeStep()), ] ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index fe9757f41bcc..3248d131590f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -104,6 +104,8 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ + default_blocks_name = "QwenImageAutoBlocks" + @property def default_height(self): return self.default_sample_size * self.vae_scale_factor @@ -158,6 +160,8 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): """ + default_blocks_name = "QwenImageEditAutoBlocks" + # YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step. @property def default_height(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/node_utils.py b/src/diffusers/modular_pipelines/qwenimage/node_utils.py new file mode 100644 index 000000000000..3230ece68abc --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/node_utils.py @@ -0,0 +1,95 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# mellon nodes +QwenImage_NODE_TYPES_PARAMS_MAP = { + "controlnet": { + "inputs": [ + "control_image", + "controlnet_conditioning_scale", + "control_guidance_start", + "control_guidance_end", + "height", + "width", + ], + "model_inputs": [ + "controlnet", + "vae", + ], + "outputs": [ + "controlnet_out", + ], + "block_names": ["controlnet_vae_encoder"], + }, + "denoise": { + "inputs": [ + "embeddings", + "width", + "height", + "seed", + "num_inference_steps", + "guidance_scale", + "image_latents", + "strength", + "controlnet", + ], + "model_inputs": [ + "unet", + "guider", + "scheduler", + ], + "outputs": [ + "latents", + "latents_preview", + ], + "block_names": ["denoise"], + }, + "vae_encoder": { + "inputs": [ + "image", + "width", + "height", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "image_latents", + ], + }, + "text_encoder": { + "inputs": [ + "prompt", + "negative_prompt", + ], + "model_inputs": [ + "text_encoders", + ], + "outputs": [ + "embeddings", + ], + }, + "decoder": { + "inputs": [ + "latents", + ], + "model_inputs": [ + "vae", + ], + "outputs": [ + "images", + ], + }, +} diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py index fefa622f1a61..70cbf0c1c78d 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py @@ -262,37 +262,37 @@ def intermediate_outputs(self) -> List[str]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="pooled text embeddings used to guide the image generation", ), OutputParam( "negative_pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="negative pooled text embeddings used to guide the image generation", ), OutputParam( "ip_adapter_embeds", type_hint=List[torch.Tensor], - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields + kwargs_type="denoiser_input_fields", # already in intermedites state but declare here again for denoiser_input_fields description="image embeddings for IP-Adapter", ), OutputParam( "negative_ip_adapter_embeds", type_hint=List[torch.Tensor], - kwargs_type="guider_input_fields", # already in intermedites state but declare here again for 
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
index fefa622f1a61..70cbf0c1c78d 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
@@ -262,37 +262,37 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "ip_adapter_embeds",
                 type_hint=List[torch.Tensor],
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="image embeddings for IP-Adapter",
             ),
             OutputParam(
                 "negative_ip_adapter_embeds",
                 type_hint=List[torch.Tensor],
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative image embeddings for IP-Adapter",
             ),
         ]
@@ -1120,13 +1120,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
             OutputParam(
                 "add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The time ids to condition the denoising process",
             ),
             OutputParam(
                 "negative_add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The negative time ids to condition the denoising process",
             ),
             OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
@@ -1331,13 +1331,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
             OutputParam(
                 "add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The time ids to condition the denoising process",
             ),
             OutputParam(
                 "negative_add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The negative time ids to condition the denoising process",
             ),
             OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
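The `guider_input_fields` -> `denoiser_input_fields` rename above is mechanical, but the tag itself is load-bearing: every block that writes a conditional embedding to the pipeline state marks its `OutputParam` with `kwargs_type="denoiser_input_fields"`, and the denoise step later collects the whole group by tag instead of naming each tensor. A producer-side sketch (import path assumed, field name hypothetical):

import torch

# OutputParam is assumed importable from the modular pipeline utilities.
from diffusers.modular_pipelines.modular_pipeline_utils import OutputParam

extra_embeds = OutputParam(
    "extra_cond_embeds",  # hypothetical field added by a custom block
    type_hint=torch.Tensor,
    kwargs_type="denoiser_input_fields",  # groups it with prompt_embeds & co.
    description="additional conditional embeddings consumed during denoising",
)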
- "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" + "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state" ), ), InputParam( diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py index 1e8921d363c1..90b254b6f5d4 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py @@ -258,25 +258,25 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="text embeddings used to guide the image generation", ), OutputParam( "negative_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="negative text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="pooled text embeddings used to guide the image generation", ), OutputParam( "negative_pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="guider_input_fields", + kwargs_type="denoiser_input_fields", description="negative pooled text embeddings used to guide the image generation", ), ] diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py index c9033856bcc0..68b5e33755b5 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py @@ -82,19 +82,17 @@ def description(self): # before_denoise: text2img class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLSetTimestepsStep, StableDiffusionXLPrepareLatentsStep, StableDiffusionXLPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step.\n" + "This is a sequential pipeline blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -104,19 +102,17 @@ def description(self): # before_denoise: img2img class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLImg2ImgSetTimestepsStep, StableDiffusionXLImg2ImgPrepareLatentsStep, StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step for img2img task.\n" + "This is a sequential pipeline 
blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -126,19 +122,17 @@ def description(self): # before_denoise: inpainting class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ - StableDiffusionXLInputStep, StableDiffusionXLImg2ImgSetTimestepsStep, StableDiffusionXLInpaintPrepareLatentsStep, StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep, ] - block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"] + block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"] @property def description(self): return ( "Before denoise step that prepare the inputs for the denoise step for inpainting task.\n" + "This is a sequential pipeline blocks:\n" - + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n" + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n" + " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n" + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n" @@ -255,25 +249,48 @@ def description(self): ) +class StableDiffusionXLCoreDenoiseStep(SequentialPipelineBlocks): + block_classes = [ + StableDiffusionXLInputStep, + StableDiffusionXLAutoBeforeDenoiseStep, + StableDiffusionXLAutoControlNetInputStep, + StableDiffusionXLAutoDenoiseStep, + ] + block_names = ["input", "before_denoise", "controlnet_input", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. 
\n" + + " - `StableDiffusionXLInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `StableDiffusionXLAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `StableDiffusionXLAutoControlNetInputStep` (controlnet_input) prepares the controlnet input.\n" + + " - `StableDiffusionXLAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" + + "This step support text-to-image, image-to-image, inpainting, with or without controlnet/controlnet_union/ip_adapter for Stable Diffusion XL:\n" + + "- for image-to-image generation, you need to provide `image_latents`\n" + + "- for inpainting, you need to provide `mask_image` and `image_latents`\n" + + "- to run the controlnet workflow, you need to provide `control_image`\n" + + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" + + "- to run the ip_adapter workflow, you need to load ip_adapter into your unet and provide `ip_adapter_embeds`\n" + + "- for text-to-image generation, all you need to provide is prompt embeddings\n" + ) + + # ip-adapter, controlnet, text2img, img2img, inpainting class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): block_classes = [ StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, StableDiffusionXLAutoVaeEncoderStep, - StableDiffusionXLAutoBeforeDenoiseStep, - StableDiffusionXLAutoControlNetInputStep, - StableDiffusionXLAutoDenoiseStep, + StableDiffusionXLCoreDenoiseStep, StableDiffusionXLAutoDecodeStep, ] block_names = [ "text_encoder", "ip_adapter", - "image_encoder", - "before_denoise", - "controlnet_input", + "vae_encoder", "denoise", - "decoder", + "decode", ] @property @@ -321,7 +338,7 @@ def description(self): IMAGE2IMAGE_BLOCKS = InsertableDict( [ ("text_encoder", StableDiffusionXLTextEncoderStep), - ("image_encoder", StableDiffusionXLVaeEncoderStep), + ("vae_encoder", StableDiffusionXLVaeEncoderStep), ("input", StableDiffusionXLInputStep), ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep), @@ -334,7 +351,7 @@ def description(self): INPAINT_BLOCKS = InsertableDict( [ ("text_encoder", StableDiffusionXLTextEncoderStep), - ("image_encoder", StableDiffusionXLInpaintVaeEncoderStep), + ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep), ("input", StableDiffusionXLInputStep), ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep), @@ -361,10 +378,8 @@ def description(self): [ ("text_encoder", StableDiffusionXLTextEncoderStep), ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ("image_encoder", StableDiffusionXLAutoVaeEncoderStep), - ("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep), - ("controlnet_input", StableDiffusionXLAutoControlNetInputStep), - ("denoise", StableDiffusionXLAutoDenoiseStep), + ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep), + ("denoise", StableDiffusionXLCoreDenoiseStep), ("decode", StableDiffusionXLAutoDecodeStep), ] ) diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py index e84f5cad1ab4..29a717f72e59 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py @@ -54,6 +54,8 @@ class StableDiffusionXLModularPipeline( """ + default_blocks_name = "StableDiffusionXLAutoBlocks" + @property 
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
new file mode 100644
index 000000000000..3e788bf94741
--- /dev/null
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py
@@ -0,0 +1,99 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+SDXL_NODE_TYPES_PARAMS_MAP = {
+    "controlnet": {
+        "inputs": [
+            "control_image",
+            "controlnet_conditioning_scale",
+            "control_guidance_start",
+            "control_guidance_end",
+            "height",
+            "width",
+        ],
+        "model_inputs": [
+            "controlnet",
+        ],
+        "outputs": [
+            "controlnet_out",
+        ],
+        "block_names": [None],
+    },
+    "denoise": {
+        "inputs": [
+            "embeddings",
+            "width",
+            "height",
+            "seed",
+            "num_inference_steps",
+            "guidance_scale",
+            "image_latents",
+            "strength",
+            # custom adapters coming in as inputs
+            "controlnet",
+            # ip_adapter is optional and custom; include if available
+            "ip_adapter",
+        ],
+        "model_inputs": [
+            "unet",
+            "guider",
+            "scheduler",
+        ],
+        "outputs": [
+            "latents",
+            "latents_preview",
+        ],
+        "block_names": ["denoise"],
+    },
+    "vae_encoder": {
+        "inputs": [
+            "image",
+            "width",
+            "height",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "image_latents",
+        ],
+        "block_names": ["vae_encoder"],
+    },
+    "text_encoder": {
+        "inputs": [
+            "prompt",
+            "negative_prompt",
+        ],
+        "model_inputs": [
+            "text_encoders",
+        ],
+        "outputs": [
+            "embeddings",
+        ],
+        "block_names": ["text_encoder"],
+    },
+    "decoder": {
+        "inputs": [
+            "latents",
+        ],
+        "model_inputs": [
+            "vae",
+        ],
+        "outputs": [
+            "images",
+        ],
+        "block_names": ["decode"],
+    },
+}
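Unlike the shared map in `mellon_node_utils.py` and the QwenImage map above, the SDXL controlnet entry sets `"block_names": [None]`, presumably because SDXL has no dedicated controlnet VAE-encode block and the node is driven purely through its parameters. Consumers therefore need to tolerate the placeholder; a defensive sketch (helper hypothetical):

def blocks_for_node(node_spec: dict, preset: dict) -> dict:
    # Skip None placeholders such as the SDXL controlnet entry.
    names = [n for n in node_spec.get("block_names", []) if n is not None]
    return {name: preset[name] for name in names if name in preset}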
diff --git a/src/diffusers/modular_pipelines/wan/before_denoise.py b/src/diffusers/modular_pipelines/wan/before_denoise.py
index 2b9889f8778a..d48f678edd59 100644
--- a/src/diffusers/modular_pipelines/wan/before_denoise.py
+++ b/src/diffusers/modular_pipelines/wan/before_denoise.py
@@ -146,13 +146,13 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative text embeddings used to guide the image generation",
             ),
         ]
diff --git a/src/diffusers/modular_pipelines/wan/denoise.py b/src/diffusers/modular_pipelines/wan/denoise.py
index 5f578609c24f..66c51493bd6a 100644
--- a/src/diffusers/modular_pipelines/wan/denoise.py
+++ b/src/diffusers/modular_pipelines/wan/denoise.py
@@ -79,11 +79,11 @@ def intermediate_inputs(self) -> List[str]:
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
             InputParam(
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description=(
                     "All conditional model inputs that need to be prepared with guider. "
                     "It should contain prompt_embeds/negative_prompt_embeds. "
-                    "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+                    "Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state."
                 ),
             ),
         ]
diff --git a/src/diffusers/modular_pipelines/wan/encoders.py b/src/diffusers/modular_pipelines/wan/encoders.py
index a0bf76b99b55..cb2fc242383c 100644
--- a/src/diffusers/modular_pipelines/wan/encoders.py
+++ b/src/diffusers/modular_pipelines/wan/encoders.py
@@ -89,13 +89,13 @@ def intermediate_outputs(self) -> List[OutputParam]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="negative text embeddings used to guide the image generation",
             ),
         ]
diff --git a/src/diffusers/modular_pipelines/wan/modular_pipeline.py b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
index 4d86e0d08e59..da4aada43839 100644
--- a/src/diffusers/modular_pipelines/wan/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
@@ -37,6 +37,8 @@ class WanModularPipeline(
 
     """
 
+    default_blocks_name = "WanAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_height * self.vae_scale_factor_spatial