huggingface · stevhliu · Nov 19, 2024 · Nov 19, 2024 · Mar 3, 2024 · Mar 3, 2024
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -25,6 +25,7 @@
     title: Working with big models
   title: Tutorials
 - sections:
+<<<<<<< HEAD
   - local: using-diffusers/loading
     title: Load pipelines
   - local: using-diffusers/custom_pipeline_overview
@@ -38,6 +39,137 @@
   - local: using-diffusers/push_to_hub
     title: Push files to the Hub
   title: Load pipelines and adapters
+=======
+  - sections:
+    - local: using-diffusers/loading_overview
+      title: Overview
+    - local: using-diffusers/loading
+      title: Load pipelines, models, and schedulers
+    - local: using-diffusers/schedulers
+      title: Load and compare different schedulers
+    - local: using-diffusers/custom_pipeline_overview
+      title: Load community pipelines and components
+    - local: using-diffusers/using_safetensors
+      title: Load safetensors
+    - local: using-diffusers/other-formats
+      title: Load different Stable Diffusion formats
+    - local: using-diffusers/loading_adapters
+      title: Load adapters
+    - local: using-diffusers/push_to_hub
+      title: Push files to the Hub
+    title: Loading & Hub
+  - sections:
+    - local: using-diffusers/pipeline_overview
+      title: Overview
+    - local: using-diffusers/unconditional_image_generation
+      title: Unconditional image generation
+    - local: using-diffusers/conditional_image_generation
+      title: Text-to-image
+    - local: using-diffusers/img2img
+      title: Image-to-image
+    - local: using-diffusers/inpaint
+      title: Inpainting
+    - local: using-diffusers/text-img2vid
+      title: Text or image-to-video
+    - local: using-diffusers/depth2img
+      title: Depth-to-image
+    title: Tasks
+  - sections:
+    - local: using-diffusers/textual_inversion_inference
+      title: Textual inversion
+    - local: using-diffusers/ip_adapter
+      title: IP-Adapter
+    - local: training/distributed_inference
+      title: Distributed inference with multiple GPUs
+    - local: using-diffusers/reusing_seeds
+      title: Improve image quality with deterministic generation
+    - local: using-diffusers/control_brightness
+      title: Control image brightness
+    - local: using-diffusers/weighted_prompts
+      title: Prompt weighting
+    - local: using-diffusers/freeu
+      title: Improve generation quality with FreeU
+    title: Techniques
+  - sections:
+    - local: using-diffusers/pipeline_overview
+      title: Overview
+    - local: using-diffusers/sdxl
+      title: Stable Diffusion XL
+    - local: using-diffusers/sdxl_turbo
+      title: SDXL Turbo
+    - local: using-diffusers/kandinsky
+      title: Kandinsky
+    - local: using-diffusers/controlnet
+      title: ControlNet
+    - local: using-diffusers/shap-e
+      title: Shap-E
+    - local: using-diffusers/diffedit
+      title: DiffEdit
+    - local: using-diffusers/distilled_sd
+      title: Distilled Stable Diffusion inference
+    - local: using-diffusers/callback
+      title: Pipeline callbacks
+    - local: using-diffusers/reproducibility
+      title: Create reproducible pipelines
+    - local: using-diffusers/custom_pipeline_examples
+      title: Community pipelines
+    - local: using-diffusers/contribute_pipeline
+      title: Contribute a community pipeline
+    - local: using-diffusers/inference_with_lcm_lora
+      title: Latent Consistency Model-LoRA
+    - local: using-diffusers/inference_with_lcm
+      title: Latent Consistency Model
+    - local: using-diffusers/svd
+      title: Stable Video Diffusion
+    title: Specific pipeline examples
+  - sections:
+    - local: training/overview
+      title: Overview
+    - local: training/create_dataset
+      title: Create a dataset for training
+    - local: training/adapt_a_model
+      title: Adapt a model to a new task
+    - sections:
+      - local: training/unconditional_training
+        title: Unconditional image generation
+      - local: training/text2image
+        title: Text-to-image
+      - local: training/sdxl
+        title: Stable Diffusion XL
+      - local: training/kandinsky
+        title: Kandinsky 2.2
+      - local: training/wuerstchen
+        title: Wuerstchen
+      - local: training/controlnet
+        title: ControlNet
+      - local: training/t2i_adapters
+        title: T2I-Adapters
+      - local: training/instructpix2pix
+        title: InstructPix2Pix
+      - local: training/ip_adapter
+        title: IP-Adapter
+      title: Models
+    - sections:
+      - local: training/text_inversion
+        title: Textual Inversion
+      - local: training/dreambooth
+        title: DreamBooth
+      - local: training/lora
+        title: LoRA
+      - local: training/custom_diffusion
+        title: Custom Diffusion
+      - local: training/lcm_distill
+        title: Latent Consistency Distillation
+      - local: training/ddpo
+        title: Reinforcement learning training with DDPO
+      title: Methods
+    title: Training
+  - sections:
+    - local: using-diffusers/other-modalities
+      title: Other Modalities
+    title: Taking Diffusers Beyond Images
+  title: Using Diffusers
+>>>>>>> 10ef7614c (docs: add IP Adapter training instructions)
 - sections:
   - local: using-diffusers/unconditional_image_generation
     title: Unconditional image generation

diff --git a/docs/source/en/training/ip_adapter.md b/docs/source/en/training/ip_adapter.md
@@ -0,0 +1,121 @@
+# IP Adapter Training Example 
+
+[IP Adapter](https://arxiv.org/abs/2308.06721) is a novel approach designed to enhance text-to-image models such as Stable Diffusion by enabling them to generate images based on image prompts rather than text prompts alone. Unlike traditional methods that rely solely on complex text prompts, IP Adapter introduces the concept of using image prompts, leveraging the idea that "an image is worth a thousand words." By decoupling cross-attention layers for text and image features, IP Adapter effectively integrates image prompts into the generation process without the need for extensive fine-tuning or large computing resources.
+
+## Training locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then `cd` into the example folder (`examples/research_projects/ip_adapter`) and run:
+
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or, for a default Accelerate configuration without answering questions about your environment:
+
+```bash
+accelerate config default
+```
+
+Or, if your environment doesn't support an interactive shell (e.g., a notebook):
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+With the environment configured, you can launch training as described below.
+
+### Accelerate Launch Command Documentation
+
+#### Description:
+The Accelerate launch command is used to train a model using multiple GPUs and mixed precision training. It launches the training script `tutorial_train_ip-adapter.py` with specified parameters and configurations.
+
+#### Usage Example:
+```bash
+accelerate launch --num_processes 8 --multi_gpu --mixed_precision "fp16" \
+  tutorial_train_ip-adapter.py \
+  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
+  --image_encoder_path="{image_encoder_path}" \
+  --data_json_file="{data.json}" \
+  --data_root_path="{image_path}" \
+  --mixed_precision="fp16" \
+  --resolution=512 \
+  --train_batch_size=8 \
+  --dataloader_num_workers=4 \
+  --learning_rate=1e-04 \
+  --weight_decay=0.01 \
+  --output_dir="{output_dir}" \
+  --save_steps=10000
+```
+
+#### Parameters:
+- `--num_processes`: Number of processes to launch for distributed training (in this example, 8 processes).
+- `--multi_gpu`: Flag indicating the usage of multiple GPUs for training.
+- `--mixed_precision "fp16"`: Enables mixed precision training with 16-bit floating-point precision.
+- `tutorial_train_ip-adapter.py`: Name of the training script to be executed.
+- `--pretrained_model_name_or_path`: Path or identifier for a pretrained model.
+- `--image_encoder_path`: Path to the CLIP image encoder.
+- `--data_json_file`: Path to the training data in JSON format.
+- `--data_root_path`: Root path where training images are located.
+- `--resolution`: Resolution of input images (512x512 in this example).
+- `--train_batch_size`: Batch size for training data (8 in this example).
+- `--dataloader_num_workers`: Number of subprocesses for data loading (4 in this example).
+- `--learning_rate`: Learning rate for training (1e-04 in this example).
+- `--weight_decay`: Weight decay for regularization (0.01 in this example).
+- `--output_dir`: Directory to save model checkpoints and predictions.
+- `--save_steps`: Frequency of saving checkpoints during training (10000 in this example).
+
+### Inference
+
+#### Description:
+The following script loads a trained model checkpoint and extracts the weights that belong to the image projection model and to the IP-Adapter (Image Prompt Adapter) modules. These weights are then saved into a single binary file for later use in inference.
+
+#### Usage Example:
+```python
+import torch
+
+# Load the trained model checkpoint
+ckpt = "checkpoint-50000/pytorch_model.bin"
+sd = torch.load(ckpt, map_location="cpu")
+
+# Extract image projection and IP adapter components
+image_proj_sd = {}
+ip_sd = {}
+for k in sd:
+    if k.startswith("unet"):
+        pass
+    elif k.startswith("image_proj_model"):
+        image_proj_sd[k.replace("image_proj_model.", "")] = sd[k]
+    elif k.startswith("adapter_modules"):
+        ip_sd[k.replace("adapter_modules.", "")] = sd[k]
+
+# Save the components into a binary file
+torch.save({"image_proj": image_proj_sd, "ip_adapter": ip_sd}, "ip_adapter.bin")
+```
+
+#### Parameters:
+- `ckpt`: Path to the trained model checkpoint file.
+- `map_location="cpu"`: Specifies that the model should be loaded onto the CPU.
+- `image_proj_sd`: Dictionary to store the components related to image projection.
+- `ip_sd`: Dictionary to store the components related to the IP adapter.
+- `"unet"`, `"image_proj_model"`, `"adapter_modules"`: Prefixes indicating components of the model.
diff --git a/examples/research_projects/ip_adapter/README.md b/examples/research_projects/ip_adapter/README.md
@@ -0,0 +1,121 @@
+# IP Adapter Training Example 
+
+[IP Adapter](https://arxiv.org/abs/2308.06721) is a novel approach designed to enhance text-to-image models such as Stable Diffusion by enabling them to generate images based on image prompts rather than text prompts alone. Unlike traditional methods that rely solely on complex text prompts, IP Adapter introduces the concept of using image prompts, leveraging the idea that "an image is worth a thousand words." By decoupling cross-attention layers for text and image features, IP Adapter effectively integrates image prompts into the generation process without the need for extensive fine-tuning or large computing resources.
+
+## Training locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then `cd` into the example folder (`examples/research_projects/ip_adapter`) and run:
+
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or, for a default Accelerate configuration without answering questions about your environment:
+
+```bash
+accelerate config default
+```
+
+Or, if your environment doesn't support an interactive shell (e.g., a notebook):
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+With the environment configured, you can launch training as described below.
+
+### Accelerate Launch Command Documentation
+
+#### Description:
+The Accelerate launch command is used to train a model using multiple GPUs and mixed precision training. It launches the training script `tutorial_train_ip-adapter.py` with specified parameters and configurations.
+
+#### Usage Example:
+```bash
+accelerate launch --num_processes 8 --multi_gpu --mixed_precision "fp16" \
+  tutorial_train_ip-adapter.py \
+  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
+  --image_encoder_path="{image_encoder_path}" \
+  --data_json_file="{data.json}" \
+  --data_root_path="{image_path}" \
+  --mixed_precision="fp16" \
+  --resolution=512 \
+  --train_batch_size=8 \
+  --dataloader_num_workers=4 \
+  --learning_rate=1e-04 \
+  --weight_decay=0.01 \
+  --output_dir="{output_dir}" \
+  --save_steps=10000
+```
+
+#### Parameters:
+- `--num_processes`: Number of processes to launch for distributed training (in this example, 8 processes).
+- `--multi_gpu`: Flag indicating the usage of multiple GPUs for training.
+- `--mixed_precision "fp16"`: Enables mixed precision training with 16-bit floating-point precision.
+- `tutorial_train_ip-adapter.py`: Name of the training script to be executed.
+- `--pretrained_model_name_or_path`: Path or identifier for a pretrained model.
+- `--image_encoder_path`: Path to the CLIP image encoder.
+- `--data_json_file`: Path to the training data in JSON format.
+- `--data_root_path`: Root path where training images are located.
+- `--resolution`: Resolution of input images (512x512 in this example).
+- `--train_batch_size`: Batch size for training data (8 in this example).
+- `--dataloader_num_workers`: Number of subprocesses for data loading (4 in this example).
+- `--learning_rate`: Learning rate for training (1e-04 in this example).
+- `--weight_decay`: Weight decay for regularization (0.01 in this example).
+- `--output_dir`: Directory to save model checkpoints and predictions.
+- `--save_steps`: Frequency of saving checkpoints during training (10000 in this example).
+
+### Inference
+
+#### Description:
+The following script loads a trained model checkpoint and extracts the weights that belong to the image projection model and to the IP-Adapter (Image Prompt Adapter) modules. These weights are then saved into a single binary file for later use in inference.
+
+#### Usage Example:
+```python
+import torch
+
+# Load the trained model checkpoint
+ckpt = "checkpoint-50000/pytorch_model.bin"
+sd = torch.load(ckpt, map_location="cpu")
+
+# Extract image projection and IP adapter components
+image_proj_sd = {}
+ip_sd = {}
+for k in sd:
+    if k.startswith("unet"):
+        pass
+    elif k.startswith("image_proj_model"):
+        image_proj_sd[k.replace("image_proj_model.", "")] = sd[k]
+    elif k.startswith("adapter_modules"):
+        ip_sd[k.replace("adapter_modules.", "")] = sd[k]
+
+# Save the components into a binary file
+torch.save({"image_proj": image_proj_sd, "ip_adapter": ip_sd}, "ip_adapter.bin")
+```
+
+#### Parameters:
+- `ckpt`: Path to the trained model checkpoint file.
+- `map_location="cpu"`: Specifies that the model should be loaded onto the CPU.
+- `image_proj_sd`: Dictionary to store the components related to image projection.
+- `ip_sd`: Dictionary to store the components related to the IP adapter.
+- `"unet"`, `"image_proj_model"`, `"adapter_modules"`: Prefixes indicating components of the model.
diff --git a/examples/research_projects/ip_adapter/requirements.txt b/examples/research_projects/ip_adapter/requirements.txt
@@ -0,0 +1,4 @@
+accelerate
+torchvision
+transformers>=4.25.1
+ip_adapter