| 68 | 68 | EXAMPLE_DOC_STRING = """ | 
| 69 | 69 |     Examples: | 
| 70 | 70 |         ```py | 
| 71 |  | -        >>> # pip install accelerate transformers safetensors diffusers | 
| 72 |  | - | 
|  | 71 | +        # !pip install controlnet_aux | 
|  | 72 | +        >>> from diffusers import ( | 
|  | 73 | +        ...     StableDiffusionXLControlNetUnionImg2ImgPipeline, | 
|  | 74 | +        ...     ControlNetUnionModel, | 
|  | 75 | +        ...     AutoencoderKL, | 
|  | 76 | +        ... ) | 
|  | 77 | +        >>> from diffusers.models.controlnets import ControlNetUnionInputProMax | 
|  | 78 | +        >>> from diffusers.utils import load_image | 
| 73 | 79 |         >>> import torch | 
| 74 |  | -        >>> import numpy as np | 
| 75 | 80 |         >>> from PIL import Image | 
| 76 |  | - | 
| 77 |  | -        >>> from transformers import DPTImageProcessor, DPTForDepthEstimation | 
| 78 |  | -        >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL | 
| 79 |  | -        >>> from diffusers.utils import load_image | 
| 80 |  | - | 
| 81 |  | - | 
| 82 |  | -        >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") | 
| 83 |  | -        >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") | 
| 84 |  | -        >>> controlnet = ControlNetModel.from_pretrained( | 
| 85 |  | -        ...     "diffusers/controlnet-depth-sdxl-1.0-small", | 
| 86 |  | -        ...     variant="fp16", | 
| 87 |  | -        ...     use_safetensors=True, | 
| 88 |  | -        ...     torch_dtype=torch.float16, | 
|  | 81 | +        >>> import numpy as np | 
|  | 82 | +        >>> prompt = "A cat" | 
|  | 83 | +        >>> # download an image | 
|  | 84 | +        >>> image = load_image( | 
|  | 85 | +        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png" | 
| 89 | 86 |         ... ) | 
| 90 |  | -        >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) | 
| 91 |  | -        >>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( | 
|  | 87 | +        >>> # initialize the models and pipeline | 
|  | 88 | +        >>> controlnet = ControlNetUnionModel.from_pretrained( | 
|  | 89 | +        ...     "brad-twinkl/controlnet-union-sdxl-1.0-promax", torch_dtype=torch.float16 | 
|  | 90 | +        ... ) | 
|  | 91 | +        >>> vae = AutoencoderKL.from_pretrained( | 
|  | 92 | +        ...     "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 | 
|  | 93 | +        ... ) | 
|  | 94 | +        >>> pipe = StableDiffusionXLControlNetUnionImg2ImgPipeline.from_pretrained( | 
| 92 | 95 |         ...     "stabilityai/stable-diffusion-xl-base-1.0", | 
| 93 | 96 |         ...     controlnet=controlnet, | 
| 94 | 97 |         ...     vae=vae, | 
| 95 |  | -        ...     variant="fp16", | 
| 96 |  | -        ...     use_safetensors=True, | 
| 97 | 98 |         ...     torch_dtype=torch.float16, | 
| 98 |  | -        ... ) | 
| 99 |  | -        >>> pipe.enable_model_cpu_offload() | 
| 100 |  | - | 
| 101 |  | - | 
| 102 |  | -        >>> def get_depth_map(image): | 
| 103 |  | -        ...     image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda") | 
| 104 |  | -        ...     with torch.no_grad(), torch.autocast("cuda"): | 
| 105 |  | -        ...         depth_map = depth_estimator(image).predicted_depth | 
| 106 |  | - | 
| 107 |  | -        ...     depth_map = torch.nn.functional.interpolate( | 
| 108 |  | -        ...         depth_map.unsqueeze(1), | 
| 109 |  | -        ...         size=(1024, 1024), | 
| 110 |  | -        ...         mode="bicubic", | 
| 111 |  | -        ...         align_corners=False, | 
|  | 99 | +        ... ).to("cuda") | 
|  | 100 | +        >>> # `enable_model_cpu_offload` is not recommended here because the example runs multiple generations | 
|  | 101 | +        >>> height = image.height | 
|  | 102 | +        >>> width = image.width | 
|  | 103 | +        >>> ratio = np.sqrt(1024.0 * 1024.0 / (width * height)) | 
|  | 104 | +        >>> # a 3 * 3 upscale corresponds to a multiple of 16 * 3, a 2 * 2 upscale to a multiple of 16 * 2, and so on. | 
|  | 105 | +        >>> scale_image_factor = 3 | 
|  | 106 | +        >>> base_factor = 16 | 
|  | 107 | +        >>> factor = scale_image_factor * base_factor | 
|  | 108 | +        >>> W, H = int(width * ratio) // factor * factor, int(height * ratio) // factor * factor | 
|  | 109 | +        >>> image = image.resize((W, H)) | 
|  | 110 | +        >>> target_width = W // scale_image_factor | 
|  | 111 | +        >>> target_height = H // scale_image_factor | 
|  | 112 | +        >>> images = [] | 
|  | 113 | +        >>> crops_coords_list = [ | 
|  | 114 | +        ...     (0, 0), | 
|  | 115 | +        ...     (0, width // 2), | 
|  | 116 | +        ...     (height // 2, 0), | 
|  | 117 | +        ...     (width // 2, height // 2), | 
|  | 118 | +        ...     (0, 0), | 
|  | 119 | +        ...     (0, 0), | 
|  | 120 | +        ...     (0, 0), | 
|  | 121 | +        ...     (0, 0), | 
|  | 122 | +        ...     (0, 0), | 
|  | 123 | +        ... ] | 
|  | 124 | +        >>> for i in range(scale_image_factor): | 
|  | 125 | +        ...     for j in range(scale_image_factor): | 
|  | 126 | +        ...         left = j * target_width | 
|  | 127 | +        ...         top = i * target_height | 
|  | 128 | +        ...         right = left + target_width | 
|  | 129 | +        ...         bottom = top + target_height | 
|  | 130 | +        ...         cropped_image = image.crop((left, top, right, bottom)) | 
|  | 131 | +        ...         cropped_image = cropped_image.resize((W, H)) | 
|  | 132 | +        ...         images.append(cropped_image) | 
|  | 133 | +        >>> # set ControlNetUnion input | 
|  | 134 | +        >>> result_images = [] | 
|  | 135 | +        >>> for sub_img, crops_coords in zip(images, crops_coords_list): | 
|  | 136 | +        ...     union_input = ControlNetUnionInputProMax( | 
|  | 137 | +        ...         tile=sub_img, | 
| 112 | 138 |         ...     ) | 
| 113 |  | -        ...     depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) | 
| 114 |  | -        ...     depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) | 
| 115 |  | -        ...     depth_map = (depth_map - depth_min) / (depth_max - depth_min) | 
| 116 |  | -        ...     image = torch.cat([depth_map] * 3, dim=1) | 
| 117 |  | -        ...     image = image.permute(0, 2, 3, 1).cpu().numpy()[0] | 
| 118 |  | -        ...     image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8)) | 
| 119 |  | -        ...     return image | 
| 120 |  | - | 
| 121 |  | - | 
| 122 |  | -        >>> prompt = "A robot, 4k photo" | 
| 123 |  | -        >>> image = load_image( | 
| 124 |  | -        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" | 
| 125 |  | -        ...     "/kandinsky/cat.png" | 
| 126 |  | -        ... ).resize((1024, 1024)) | 
| 127 |  | -        >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization | 
| 128 |  | -        >>> depth_image = get_depth_map(image) | 
| 129 |  | - | 
| 130 |  | -        >>> images = pipe( | 
| 131 |  | -        ...     prompt, | 
| 132 |  | -        ...     image=image, | 
| 133 |  | -        ...     control_image=depth_image, | 
| 134 |  | -        ...     strength=0.99, | 
| 135 |  | -        ...     num_inference_steps=50, | 
| 136 |  | -        ...     controlnet_conditioning_scale=controlnet_conditioning_scale, | 
| 137 |  | -        ... ).images | 
| 138 |  | -        >>> images[0].save(f"robot_cat.png") | 
|  | 139 | +        ...     new_width, new_height = W, H | 
|  | 140 | +        ...     out = pipe( | 
|  | 141 | +        ...         prompt=[prompt] * 1, | 
|  | 142 | +        ...         image=sub_img, | 
|  | 143 | +        ...         control_image_list=union_input, | 
|  | 144 | +        ...         width=new_width, | 
|  | 145 | +        ...         height=new_height, | 
|  | 146 | +        ...         num_inference_steps=30, | 
|  | 147 | +        ...         crops_coords_top_left=(W, H), | 
|  | 148 | +        ...         target_size=(W, H), | 
|  | 149 | +        ...         original_size=(W * 2, H * 2), | 
|  | 150 | +        ...     ) | 
|  | 151 | +        ...     result_images.append(out.images[0]) | 
|  | 152 | +        >>> new_im = Image.new( | 
|  | 153 | +        ...     "RGB", (new_width * scale_image_factor, new_height * scale_image_factor) | 
|  | 154 | +        ... ) | 
|  | 155 | +        >>> new_im.paste(result_images[0], (0, 0)) | 
|  | 156 | +        >>> new_im.paste(result_images[1], (new_width, 0)) | 
|  | 157 | +        >>> new_im.paste(result_images[2], (new_width * 2, 0)) | 
|  | 158 | +        >>> new_im.paste(result_images[3], (0, new_height)) | 
|  | 159 | +        >>> new_im.paste(result_images[4], (new_width, new_height)) | 
|  | 160 | +        >>> new_im.paste(result_images[5], (new_width * 2, new_height)) | 
|  | 161 | +        >>> new_im.paste(result_images[6], (0, new_height * 2)) | 
|  | 162 | +        >>> new_im.paste(result_images[7], (new_width, new_height * 2)) | 
|  | 163 | +        >>> new_im.paste(result_images[8], (new_width * 2, new_height * 2)) | 
| 139 | 164 |         ``` | 
| 140 | 165 | """ | 
| 141 | 166 | 
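A side note on the new example: the nine explicit `new_im.paste(...)` calls reassemble the upscaled tiles in row-major order, mirroring the nested crop loops earlier in the docstring. The same step can be written as a loop; the sketch below is only an illustration (the `merge_tiles` helper is hypothetical and not part of this change), and it assumes `result_images` holds `scale_image_factor ** 2` tiles of size `new_width` x `new_height` in the order the crop loops append them.

```py
from PIL import Image


def merge_tiles(result_images, new_width, new_height, scale_image_factor=3):
    # Hypothetical helper, not part of the diff above: paste each upscaled
    # tile at its grid position, assuming row-major tile order.
    merged = Image.new(
        "RGB", (new_width * scale_image_factor, new_height * scale_image_factor)
    )
    for idx, tile in enumerate(result_images):
        row, col = divmod(idx, scale_image_factor)
        merged.paste(tile, (col * new_width, row * new_height))
    return merged
```

On the size arithmetic used by the example: with `scale_image_factor = 3` and `base_factor = 16`, `factor` is 48, so `W` and `H` are rounded down to multiples of 48; each of the nine crops is then exactly `W // 3` by `H // 3` pixels before being resized back up to `W` by `H` for the tile ControlNet pass.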