Commit acdd37b

Merge branch 'main' into controlnet_num_train_epochs_patch
2 parents 224471b + 0028c34

2 files changed: +18 −138 lines
Lines changed: 1 addition & 121 deletions
@@ -1,121 +1 @@
-This project is an attempt to check if it's possible to apply [ORPO](https://arxiv.org/abs/2403.07691) to a text-conditioned diffusion model to align it on preference data WITHOUT a reference model. The implementation is based on https://github.com/huggingface/trl/pull/1435/.
-
-> [!WARNING]
-> We assume that the MSE in the diffusion formulation approximates the log-probs as required by ORPO (hat-tip to [@kashif](https://github.com/kashif) for the idea). So, please consider this to be extremely experimental.
-
-## Training
-
-Here's a training command you can use on a 40GB A100 to validate things on a [small preference
-dataset](https://hf.co/datasets/kashif/pickascore):
-
-```bash
-accelerate launch train_diffusion_orpo_sdxl_lora.py \
-  --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
-  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
-  --output_dir="diffusion-sdxl-orpo" \
-  --mixed_precision="fp16" \
-  --dataset_name=kashif/pickascore \
-  --train_batch_size=8 \
-  --gradient_accumulation_steps=2 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --rank=8 \
-  --learning_rate=1e-5 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=2000 \
-  --checkpointing_steps=500 \
-  --run_validation --validation_steps=50 \
-  --seed="0" \
-  --report_to="wandb" \
-  --push_to_hub
-```
-
-We also provide a simple script to scale up the training on the [yuvalkirstain/pickapic_v2](https://huggingface.co/datasets/yuvalkirstain/pickapic_v2) dataset:
-
-```bash
-accelerate launch --multi_gpu train_diffusion_orpo_sdxl_lora_wds.py \
-  --pretrained_model_name_or_path=stabilityai/stable-diffusion-xl-base-1.0 \
-  --pretrained_vae_model_name_or_path=madebyollin/sdxl-vae-fp16-fix \
-  --dataset_path="pipe:aws s3 cp s3://diffusion-preference-opt/{00000..00644}.tar -" \
-  --output_dir="diffusion-sdxl-orpo-wds" \
-  --mixed_precision="fp16" \
-  --gradient_accumulation_steps=1 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --rank=8 \
-  --dataloader_num_workers=8 \
-  --learning_rate=3e-5 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=50000 \
-  --checkpointing_steps=2000 \
-  --run_validation --validation_steps=500 \
-  --seed="0" \
-  --report_to="wandb" \
-  --push_to_hub
-```
-
-We tested the above on a node of 8 H100s, but it should also work on A100s. It requires the `webdataset` library for faster dataloading. Note that we kept the dataset shards in an S3 bucket, but it should also be possible to store them locally.
-
-You can use the code below to convert the original dataset into `webdataset` shards:
-
-```python
-import os
-import io
-import ray
-import webdataset as wds
-from datasets import Dataset
-from PIL import Image
-
-ray.init(num_cpus=8)
-
-
-def convert_to_image(im_bytes):
-    return Image.open(io.BytesIO(im_bytes)).convert("RGB")
-
-def main():
-    dataset_path = "/pickapic_v2/data"
-    wds_shards_path = "/pickapic_v2_webdataset"
-    # get all .parquet files in the dataset path
-    dataset_files = [
-        os.path.join(dataset_path, f)
-        for f in os.listdir(dataset_path)
-        if f.endswith(".parquet")
-    ]
-
-    @ray.remote
-    def create_shard(path):
-        # get basename of the file
-        basename = os.path.basename(path)
-        # get the shard number: data-00123-of-01034.parquet -> 00123
-        shard_num = basename.split("-")[1]
-        dataset = Dataset.from_parquet(path)
-        # create a webdataset shard
-        shard = wds.TarWriter(os.path.join(wds_shards_path, f"{shard_num}.tar"))
-
-        for i, example in enumerate(dataset):
-            wds_example = {
-                "__key__": str(i),
-                "original_prompt.txt": example["caption"],
-                "jpg_0.jpg": convert_to_image(example["jpg_0"]),
-                "jpg_1.jpg": convert_to_image(example["jpg_1"]),
-                "label_0.txt": str(example["label_0"]),
-                "label_1.txt": str(example["label_1"])
-            }
-            shard.write(wds_example)
-        shard.close()
-
-    futures = [create_shard.remote(path) for path in dataset_files]
-    ray.get(futures)
-
-
-if __name__ == "__main__":
-    main()
-```
-
-## Inference
-
-Refer to [sayakpaul/diffusion-sdxl-orpo](https://huggingface.co/sayakpaul/diffusion-sdxl-orpo) for an experimental checkpoint.
+This project has a new home now: [https://mapo-t2i.github.io/](https://mapo-t2i.github.io/). We formally studied the use of ORPO in the context of diffusion models and open-sourced our codebase, models, and datasets. We released our paper too!
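For context on the warning in the removed README above: ORPO needs per-example log-probabilities to form its odds ratio, and the project approximates them with the diffusion MSE. The snippet below is only a rough, hypothetical sketch of that assumption — the function name, the `beta_orpo` weight, and the 4D latent shapes are all made up here, and `-MSE` is used as a stand-in for `log p`. See the training scripts referenced in the README for the actual implementation.

```python
import torch
import torch.nn.functional as F


def diffusion_orpo_loss(pred_w, pred_l, target_w, target_l, beta_orpo=0.1):
    """Hypothetical ORPO-style objective where -MSE stands in for log p of each branch."""
    # Per-sample MSE of the preferred ("w") and rejected ("l") noise predictions,
    # averaged over the (C, H, W) dims of the latents.
    mse_w = F.mse_loss(pred_w.float(), target_w.float(), reduction="none").mean(dim=[1, 2, 3])
    mse_l = F.mse_loss(pred_l.float(), target_l.float(), reduction="none").mean(dim=[1, 2, 3])

    # Treat -MSE as log p, so log-odds = log p - log(1 - p); assumes MSE > 0 so exp(-MSE) < 1.
    log_p_w, log_p_l = -mse_w, -mse_l
    log_odds = (log_p_w - log_p_l) - (
        torch.log1p(-torch.exp(log_p_w)) - torch.log1p(-torch.exp(log_p_l))
    )

    # Usual diffusion loss on the preferred branch plus the weighted odds-ratio penalty.
    ratio_loss = -F.logsigmoid(log_odds).mean()
    return mse_w.mean() + beta_orpo * ratio_loss
```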

src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py

Lines changed: 17 additions & 17 deletions
@@ -376,6 +376,7 @@ def __call__(
 
 # 2. Define call parameters
 batch_size = 1 if isinstance(prompt, str) else len(prompt)
+device = self._execution_device
 
 if editing_prompt:
 enable_edit_guidance = True
@@ -405,7 +406,7 @@ def __call__(
 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
 )
 text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
 # duplicate text embeddings for each generation per prompt, using mps friendly method
 bs_embed, seq_len, _ = text_embeddings.shape
@@ -433,9 +434,9 @@ def __call__(
 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
 )
 edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length]
-edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0]
+edit_concepts = self.text_encoder(edit_concepts_input_ids.to(device))[0]
 else:
-edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1)
+edit_concepts = editing_prompt_embeddings.to(device).repeat(batch_size, 1, 1)
 
 # duplicate text embeddings for each generation per prompt, using mps friendly method
 bs_embed_edit, seq_len_edit, _ = edit_concepts.shape
@@ -476,7 +477,7 @@ def __call__(
 truncation=True,
 return_tensors="pt",
 )
-uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
 # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
 seq_len = uncond_embeddings.shape[1]
@@ -493,7 +494,7 @@ def __call__(
 # get the initial random noise unless the user supplied it
 
 # 4. Prepare timesteps
-self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+self.scheduler.set_timesteps(num_inference_steps, device=device)
 timesteps = self.scheduler.timesteps
 
 # 5. Prepare latent variables
@@ -504,7 +505,7 @@ def __call__(
 height,
 width,
 text_embeddings.dtype,
-self.device,
+device,
 generator,
 latents,
 )
@@ -562,12 +563,12 @@ def __call__(
 if enable_edit_guidance:
 concept_weights = torch.zeros(
 (len(noise_pred_edit_concepts), noise_guidance.shape[0]),
-device=self.device,
+device=device,
 dtype=noise_guidance.dtype,
 )
 noise_guidance_edit = torch.zeros(
 (len(noise_pred_edit_concepts), *noise_guidance.shape),
-device=self.device,
+device=device,
 dtype=noise_guidance.dtype,
 )
 # noise_guidance_edit = torch.zeros_like(noise_guidance)
@@ -644,21 +645,19 @@ def __call__(
 
 # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp
 
-warmup_inds = torch.tensor(warmup_inds).to(self.device)
+warmup_inds = torch.tensor(warmup_inds).to(device)
 if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0:
 concept_weights = concept_weights.to("cpu") # Offload to cpu
 noise_guidance_edit = noise_guidance_edit.to("cpu")
 
-concept_weights_tmp = torch.index_select(concept_weights.to(self.device), 0, warmup_inds)
+concept_weights_tmp = torch.index_select(concept_weights.to(device), 0, warmup_inds)
 concept_weights_tmp = torch.where(
 concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp
 )
 concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0)
 # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp)
 
-noise_guidance_edit_tmp = torch.index_select(
-noise_guidance_edit.to(self.device), 0, warmup_inds
-)
+noise_guidance_edit_tmp = torch.index_select(noise_guidance_edit.to(device), 0, warmup_inds)
 noise_guidance_edit_tmp = torch.einsum(
 "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp
 )
@@ -669,8 +668,8 @@ def __call__(
 
 del noise_guidance_edit_tmp
 del concept_weights_tmp
-concept_weights = concept_weights.to(self.device)
-noise_guidance_edit = noise_guidance_edit.to(self.device)
+concept_weights = concept_weights.to(device)
+noise_guidance_edit = noise_guidance_edit.to(device)
 
 concept_weights = torch.where(
 concept_weights < 0, torch.zeros_like(concept_weights), concept_weights
@@ -679,6 +678,7 @@ def __call__(
 concept_weights = torch.nan_to_num(concept_weights)
 
 noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit)
+noise_guidance_edit = noise_guidance_edit.to(edit_momentum.device)
 
 noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
 
@@ -689,7 +689,7 @@ def __call__(
 self.sem_guidance[i] = noise_guidance_edit.detach().cpu()
 
 if sem_guidance is not None:
-edit_guidance = sem_guidance[i].to(self.device)
+edit_guidance = sem_guidance[i].to(device)
 noise_guidance = noise_guidance + edit_guidance
 
 noise_pred = noise_pred_uncond + noise_guidance
@@ -705,7 +705,7 @@ def __call__(
 # 8. Post-processing
 if not output_type == "latent":
 image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype)
+image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 else:
 image = latents
 has_nsfw_concept = None
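The pipeline diff above applies one pattern throughout: resolve the device once via `self._execution_device` (which, in diffusers, takes model-offloading hooks into account) and pass that local variable around, instead of repeatedly reading `self.device`, which can point at `cpu` or `meta` while components are offloaded. Below is a minimal, self-contained stand-in illustrating the idea; the `TinyPipeline` class is hypothetical and not diffusers code.

```python
import torch


class TinyPipeline:
    """Hypothetical stand-in for a diffusers pipeline, showing the device-resolution pattern."""

    def __init__(self, module: torch.nn.Module):
        self.module = module

    @property
    def _execution_device(self) -> torch.device:
        # Simplification: real pipelines also inspect offloading hooks here;
        # we just report where the module's parameters currently live.
        return next(self.module.parameters()).device

    @torch.no_grad()
    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        # Resolve the device once at the top of the call and reuse the local
        # variable, rather than reading a possibly stale `self.device` everywhere.
        device = self._execution_device
        return self.module(x.to(device))


pipe = TinyPipeline(torch.nn.Linear(4, 4))
print(pipe(torch.randn(2, 4)).shape)  # torch.Size([2, 4])
```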

0 commit comments