
Commit b7434cd

Merge branch 'main' into ltx-vid2vid
2 parents: 7907359 + f35a387

21 files changed: +1506 −22 lines

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -400,6 +400,8 @@
         title: DiT
       - local: api/pipelines/flux
         title: Flux
+      - local: api/pipelines/control_flux_inpaint
+        title: FluxControlInpaint
       - local: api/pipelines/hunyuandit
         title: Hunyuan-DiT
       - local: api/pipelines/hunyuan_video
```
docs/source/en/api/pipelines/control_flux_inpaint.md (new file)

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
<!--Copyright 2024 The HuggingFace Team, The Black Forest Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# FluxControlInpaint

[`FluxControlInpaintPipeline`] implements inpainting for the FLUX.1 Depth/Canny models: it takes an image, a mask, and a structural control image as input and returns the inpainted image.

FLUX.1 Depth and Canny [dev] are 12 billion parameter rectified flow transformers that generate an image from a text description while following the structure of a given input image. **These are not ControlNet models**.

| Control type | Developer | Link |
| -------- | ---------- | ---- |
| Depth | [Black Forest Labs](https://huggingface.co/black-forest-labs) | [Link](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev) |
| Canny | [Black Forest Labs](https://huggingface.co/black-forest-labs) | [Link](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) |

<Tip>

Flux can be quite expensive to run on consumer hardware. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c).

</Tip>
```python
import torch
import numpy as np
from PIL import Image
from transformers import T5EncoderModel

from diffusers import FluxControlInpaintPipeline
from diffusers.models.transformers import FluxTransformer2DModel
from diffusers.utils import load_image, make_image_grid
from image_gen_aux import DepthPreprocessor  # https://github.com/huggingface/image_gen_aux

pipe = FluxControlInpaintPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Depth-dev",
    torch_dtype=torch.bfloat16,
)
# If you are GPU-memory constrained, swap in the NF4-quantized transformer and text
# encoder and enable CPU offload instead of moving the whole pipeline to the GPU.
# ---------------------------------------------------------------
transformer = FluxTransformer2DModel.from_pretrained(
    "sayakpaul/FLUX.1-Depth-dev-nf4", subfolder="transformer", torch_dtype=torch.bfloat16
)
text_encoder_2 = T5EncoderModel.from_pretrained(
    "sayakpaul/FLUX.1-Depth-dev-nf4", subfolder="text_encoder_2", torch_dtype=torch.bfloat16
)
pipe.transformer = transformer
pipe.text_encoder_2 = text_encoder_2
pipe.enable_model_cpu_offload()
# ---------------------------------------------------------------
pipe.to("cuda")  # skip this line if you enabled CPU offload above

prompt = "a blue robot singing opera with human-like expressions"
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")

# Build a rectangular mask over the robot's head region (255 = inpaint, 0 = keep).
head_mask = np.zeros_like(image)
head_mask[65:580, 300:642] = 255
mask_image = Image.fromarray(head_mask)

# Extract a depth map from the input image to use as the structural control signal.
processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
control_image = processor(image)[0].convert("RGB")

output = pipe(
    prompt=prompt,
    image=image,
    control_image=control_image,
    mask_image=mask_image,
    num_inference_steps=30,
    strength=0.9,
    guidance_scale=10.0,
    generator=torch.Generator().manual_seed(42),
).images[0]
make_image_grid([image, control_image, mask_image, output.resize(image.size)], rows=1, cols=4).save("output.png")
```
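If you want to quantize the base checkpoint on the fly instead of loading the pre-quantized NF4 weights above, a minimal sketch with the bitsandbytes backend could look like this (assuming `bitsandbytes` is installed; the exact `BitsAndBytesConfig` options are illustrative, not part of the original example):

```python
import torch
from diffusers import BitsAndBytesConfig, FluxControlInpaintPipeline, FluxTransformer2DModel

# Quantize the transformer to 4-bit NF4 at load time instead of pulling the
# pre-quantized "sayakpaul/FLUX.1-Depth-dev-nf4" weights shown above.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-Depth-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipe = FluxControlInpaintPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Depth-dev",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
```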
## FluxControlInpaintPipeline
[[autodoc]] FluxControlInpaintPipeline
  - all
  - __call__


## FluxPipelineOutput
[[autodoc]] pipelines.flux.pipeline_output.FluxPipelineOutput

docs/source/en/quantization/gguf.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -25,9 +25,9 @@ pip install -U gguf
 
 Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`].
 
-When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`(typically `torch.unint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`.
+When using GGUF checkpoints, the quantized weights remain in a low memory `dtype` (typically `torch.uint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`.
 
-The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the Pytorch ports of the original (`numpy`)[https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py] implementation by [compilade](https://github.com/compilade).
+The functions used for dynamic dequantization are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the PyTorch ports of the original [`numpy`](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py) implementation by [compilade](https://github.com/compilade).
 
 ```python
 import torch
````
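As a minimal sketch of what the updated paragraph describes, a GGUF checkpoint can be loaded through `from_single_file` with a `GGUFQuantizationConfig` (the checkpoint URL below is illustrative, not part of this commit):

```python
import torch
from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# The quantized weights stay in their low-memory GGUF dtype and are
# dequantized to `compute_dtype` inside each module's forward pass.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```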

docs/source/en/quantization/overview.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -33,8 +33,8 @@ If you are new to the quantization field, we recommend you to check out these be
 ## When to use what?
 
 Diffusers currently supports the following quantization methods.
-- [BitsandBytes]()
-- [TorchAO]()
-- [GGUF]()
+- [BitsandBytes](./bitsandbytes.md)
+- [TorchAO](./torchao.md)
+- [GGUF](./gguf.md)
 
 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
```
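For a quick sense of how the backends listed above are used, here is a hedged sketch with TorchAO (assuming `torchao` is installed; the `int8_weight_only` setting is just an example, not something this commit prescribes):

```python
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig

# Weight-only int8 quantization of the Flux transformer via torchao.
quant_config = TorchAoConfig("int8_weight_only")
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```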

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -277,6 +277,7 @@
         "CogView3PlusPipeline",
         "CycleDiffusionPipeline",
         "FluxControlImg2ImgPipeline",
+        "FluxControlInpaintPipeline",
         "FluxControlNetImg2ImgPipeline",
         "FluxControlNetInpaintPipeline",
         "FluxControlNetPipeline",
@@ -765,6 +766,7 @@
         CogView3PlusPipeline,
         CycleDiffusionPipeline,
         FluxControlImg2ImgPipeline,
+        FluxControlInpaintPipeline,
         FluxControlNetImg2ImgPipeline,
         FluxControlNetInpaintPipeline,
         FluxControlNetPipeline,
```

src/diffusers/loaders/single_file_utils.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -151,6 +151,8 @@
     "animatediff_scribble": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-scribble"},
     "animatediff_rgb": {"pretrained_model_name_or_path": "guoyww/animatediff-sparsectrl-rgb"},
     "flux-dev": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-dev"},
+    "flux-fill": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Fill-dev"},
+    "flux-depth": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Depth-dev"},
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
     "ltx-video": {"pretrained_model_name_or_path": "Lightricks/LTX-Video"},
     "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
@@ -587,7 +589,13 @@ def infer_diffusers_model_type(checkpoint):
     if any(
         g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
     ):
-        model_type = "flux-dev"
+        if checkpoint["img_in.weight"].shape[1] == 384:
+            model_type = "flux-fill"
+
+        elif checkpoint["img_in.weight"].shape[1] == 128:
+            model_type = "flux-depth"
+        else:
+            model_type = "flux-dev"
     else:
         model_type = "flux-schnell"
```
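The new branches tell the FLUX variants apart by the input width of `img_in.weight`: 384 selects the Fill checkpoint, 128 selects the Depth/Canny control checkpoints, and anything else falls back to `flux-dev`. A standalone sketch of the same dispatch (the helper name is hypothetical, for illustration only):

```python
def guess_flux_variant(checkpoint: dict) -> str:
    # Mirrors the dispatch added to infer_diffusers_model_type: the second
    # dimension of img_in.weight reveals which conditioning layout the
    # transformer was trained with.
    in_features = checkpoint["img_in.weight"].shape[1]
    if in_features == 384:
        return "flux-fill"
    if in_features == 128:
        return "flux-depth"
    return "flux-dev"
```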

src/diffusers/models/embeddings.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -691,7 +691,7 @@ def _get_positional_embeddings(
             output_type="pt",
         )
         pos_embedding = pos_embedding.flatten(0, 1)
-        joint_pos_embedding = torch.zeros(
+        joint_pos_embedding = pos_embedding.new_zeros(
             1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
         )
         joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
```
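The switch to `new_zeros` appears to be about dtype/device consistency: `Tensor.new_zeros` creates the buffer with the dtype and device of `pos_embedding`, whereas a bare `torch.zeros` defaults to float32 on the default device. A small illustrative sketch (the shapes are made up):

```python
import torch

pos_embedding = torch.randn(10, 64, dtype=torch.bfloat16)  # stand-in for the sincos embedding

joint_a = torch.zeros(1, 236, 64)              # float32 on the default device
joint_b = pos_embedding.new_zeros(1, 236, 64)  # bfloat16, same device as pos_embedding

print(joint_a.dtype, joint_b.dtype)  # torch.float32 torch.bfloat16
```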

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -128,6 +128,7 @@
     ]
     _import_structure["flux"] = [
         "FluxControlPipeline",
+        "FluxControlInpaintPipeline",
         "FluxControlImg2ImgPipeline",
         "FluxControlNetPipeline",
         "FluxControlNetImg2ImgPipeline",
@@ -539,6 +540,7 @@
     )
     from .flux import (
         FluxControlImg2ImgPipeline,
+        FluxControlInpaintPipeline,
         FluxControlNetImg2ImgPipeline,
         FluxControlNetInpaintPipeline,
         FluxControlNetPipeline,
```

src/diffusers/pipelines/flux/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -26,6 +26,7 @@
     _import_structure["pipeline_flux"] = ["FluxPipeline"]
     _import_structure["pipeline_flux_control"] = ["FluxControlPipeline"]
     _import_structure["pipeline_flux_control_img2img"] = ["FluxControlImg2ImgPipeline"]
+    _import_structure["pipeline_flux_control_inpaint"] = ["FluxControlInpaintPipeline"]
     _import_structure["pipeline_flux_controlnet"] = ["FluxControlNetPipeline"]
     _import_structure["pipeline_flux_controlnet_image_to_image"] = ["FluxControlNetImg2ImgPipeline"]
     _import_structure["pipeline_flux_controlnet_inpainting"] = ["FluxControlNetInpaintPipeline"]
@@ -44,6 +45,7 @@
     from .pipeline_flux import FluxPipeline
     from .pipeline_flux_control import FluxControlPipeline
     from .pipeline_flux_control_img2img import FluxControlImg2ImgPipeline
+    from .pipeline_flux_control_inpaint import FluxControlInpaintPipeline
     from .pipeline_flux_controlnet import FluxControlNetPipeline
     from .pipeline_flux_controlnet_image_to_image import FluxControlNetImg2ImgPipeline
     from .pipeline_flux_controlnet_inpainting import FluxControlNetInpaintPipeline
```
