From 99fea2ce32f5266108bf8c038e5725dce330cf8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Fri, 18 Oct 2024 11:01:44 +0300 Subject: [PATCH 1/5] [matryoshka.py] Add schedule_shifted_power attribute and update get_schedule_shifted method --- examples/community/matryoshka.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py index 7ef1438f7204..e7f96a6ad0eb 100644 --- a/examples/community/matryoshka.py +++ b/examples/community/matryoshka.py @@ -420,6 +420,7 @@ def __init__( self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) self.scales = None + self.schedule_shifted_power = 1.0 def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ @@ -532,6 +533,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def get_schedule_shifted(self, alpha_prod, scale_factor=None): if (scale_factor is not None) and (scale_factor > 1): # rescale noise schedule + scale_factor = scale_factor ** self.schedule_shifted_power snr = alpha_prod / (1 - alpha_prod) scaled_snr = snr / scale_factor alpha_prod = 1 / (1 + 1 / scaled_snr) @@ -3816,6 +3818,8 @@ def __init__( if hasattr(unet, "nest_ratio"): scheduler.scales = unet.nest_ratio + [1] + if nesting_level == 2: + scheduler.schedule_shifted_power = 2.0 self.register_modules( text_encoder=text_encoder, From bdd4286b131c45145599d874a999d115b2c5ef74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Sun, 20 Oct 2024 15:07:45 +0300 Subject: [PATCH 2/5] Fix `schedule_shifted_power` usage --- examples/community/matryoshka.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py index e7f96a6ad0eb..a1638aab4656 100644 --- a/examples/community/matryoshka.py +++ b/examples/community/matryoshka.py @@ -642,16 +642,16 @@ def step( if self.config.thresholding: if len(model_output) > 1: pred_original_sample = [ - self._threshold_sample(p_o_s * scale) / scale - for p_o_s, scale in zip(pred_original_sample, self.scales) + self._threshold_sample(p_o_s) + for p_o_s in pred_original_sample ] else: pred_original_sample = self._threshold_sample(pred_original_sample) elif self.config.clip_sample: if len(model_output) > 1: pred_original_sample = [ - (p_o_s * scale).clamp(-self.config.clip_sample_range, self.config.clip_sample_range) / scale - for p_o_s, scale in zip(pred_original_sample, self.scales) + p_o_s.clamp(-self.config.clip_sample_range, self.config.clip_sample_range) + for p_o_s in pred_original_sample ] else: pred_original_sample = pred_original_sample.clamp( @@ -3846,12 +3846,14 @@ def change_nesting_level(self, nesting_level: int): ).to(self.device) self.config.nesting_level = 1 self.scheduler.scales = self.unet.nest_ratio + [1] + self.scheduler.schedule_shifted_power = 1.0 elif nesting_level == 2: self.unet = NestedUNet2DConditionModel.from_pretrained( "tolgacangoz/matryoshka-diffusion-models", subfolder="unet/nesting_level_2" ).to(self.device) self.config.nesting_level = 2 self.scheduler.scales = self.unet.nest_ratio + [1] + self.scheduler.schedule_shifted_power = 2.0 else: raise ValueError("Currently, nesting levels 0, 1, and 2 are supported.") @@ -4631,8 +4633,8 @@ def __call__( image = latents if self.scheduler.scales is not None: - for i, (img, scale) in enumerate(zip(image, self.scheduler.scales)): - image[i] = self.image_processor.postprocess(img * scale, output_type=output_type)[0] + for i, img in enumerate(image): + image[i] = self.image_processor.postprocess(img, output_type=output_type)[0] else: image = self.image_processor.postprocess(image, output_type=output_type) From dfe0047609843ef36cd35c6a7a617f77ff514537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Sun, 20 Oct 2024 15:10:08 +0300 Subject: [PATCH 3/5] style --- examples/community/matryoshka.py | 7 ++----- src/diffusers/dependency_versions_table.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py index a1638aab4656..d04b42e8a8ab 100644 --- a/examples/community/matryoshka.py +++ b/examples/community/matryoshka.py @@ -533,7 +533,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def get_schedule_shifted(self, alpha_prod, scale_factor=None): if (scale_factor is not None) and (scale_factor > 1): # rescale noise schedule - scale_factor = scale_factor ** self.schedule_shifted_power + scale_factor = scale_factor**self.schedule_shifted_power snr = alpha_prod / (1 - alpha_prod) scaled_snr = snr / scale_factor alpha_prod = 1 / (1 + 1 / scaled_snr) @@ -641,10 +641,7 @@ def step( # 4. Clip or threshold "predicted x_0" if self.config.thresholding: if len(model_output) > 1: - pred_original_sample = [ - self._threshold_sample(p_o_s) - for p_o_s in pred_original_sample - ] + pred_original_sample = [self._threshold_sample(p_o_s) for p_o_s in pred_original_sample] else: pred_original_sample = self._threshold_sample(pred_original_sample) elif self.config.clip_sample: diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 9e7bf242eca7..0e421b71e48d 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -38,7 +38,7 @@ "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", - "torch": "torch>=1.4", + "torch": "torch>=1.4,<2.5.0", "torchvision": "torchvision", "transformers": "transformers>=4.41.2", "urllib3": "urllib3<=2.0.0", From 30d5d8d688bd266fdc4418d64298cb2551321ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Sun, 20 Oct 2024 15:42:17 +0300 Subject: [PATCH 4/5] Refactor image URLs and remove negative prompt in `README.md` --- examples/community/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index 267c8f4bb904..3f16eda52650 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -4336,19 +4336,19 @@ The Abstract of the paper: **64x64** :-------------------------: -| bird_64 | +| bird_64_64 | - `256×256, nesting_level=1`: 1.776 GiB. With `150` DDIM inference steps: **64x64** | **256x256** :-------------------------:|:-------------------------: -| 64x64 | 256x256 | +| bird_256_64 | bird_256_256 | -- `1024×1024, nesting_level=2`: 1.792 GiB. As one can realize the cost of adding another layer is really negligible. With `250` DDIM inference steps: +- `1024×1024, nesting_level=2`: 1.792 GiB. As one can realize the cost of adding another layer is really negligible in this context! With `250` DDIM inference steps: **64x64** | **256x256** | **1024x1024** :-------------------------:|:-------------------------:|:-------------------------: -| 64x64 | 256x256 | 1024x1024 | +| bird_1024_64 | bird_1024_256 | bird_1024_1024 | ```py from diffusers import DiffusionPipeline @@ -4362,8 +4362,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-model prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree" prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed" -negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy" -image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images +image = pipe(prompt=prompt, num_inference_steps=50).images make_image_grid(image, rows=1, cols=len(image)) # pipe.change_nesting_level() # 0, 1, or 2 From d11290e405d9d1fb8070218588b561c8ccd9a736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Sun, 20 Oct 2024 18:11:37 +0300 Subject: [PATCH 5/5] Refactor comments --- examples/community/README.md | 2 +- examples/community/matryoshka.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index 3f16eda52650..4f16f65df8fa 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -4362,7 +4362,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-model prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree" prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed" -image = pipe(prompt=prompt, num_inference_steps=50).images +image = pipe(prompt, num_inference_steps=50).images make_image_grid(image, rows=1, cols=len(image)) # pipe.change_nesting_level() # 0, 1, or 2 diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py index d04b42e8a8ab..7ac0ab542910 100644 --- a/examples/community/matryoshka.py +++ b/examples/community/matryoshka.py @@ -107,15 +107,16 @@ >>> # nesting_level=0 -> 64x64; nesting_level=1 -> 256x256 - 64x64; nesting_level=2 -> 1024x1024 - 256x256 - 64x64 >>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-models", - >>> custom_pipeline="matryoshka").to("cuda") + ... nesting_level=0, + ... trust_remote_code=False, # One needs to give permission for this code to run + ... ).to("cuda") >>> prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree" >>> prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed" - >>> negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy" - >>> image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images + >>> image = pipe(prompt, num_inference_steps=50).images >>> make_image_grid(image, rows=1, cols=len(image)) - >>> pipe.change_nesting_level() # 0, 1, or 2 + >>> # pipe.change_nesting_level() # 0, 1, or 2 >>> # 50+, 100+, and 250+ num_inference_steps are recommended for nesting levels 0, 1, and 2 respectively. ``` """