From 28d785a95187d3a0295780c94ff52d2cbfbd29a4 Mon Sep 17 00:00:00 2001
From: sunxunle
Date: Mon, 20 Jan 2025 16:15:29 +0800
Subject: [PATCH] chore: remove redundant words

Signed-off-by: sunxunle
---
 docs/source/en/api/pipelines/mochi.md    | 2 +-
 scripts/convert_consistency_decoder.py   | 2 +-
 src/diffusers/optimization.py            | 2 +-
 src/diffusers/pipelines/pag/pag_utils.py | 2 +-
 src/diffusers/video_processor.py         | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/api/pipelines/mochi.md b/docs/source/en/api/pipelines/mochi.md
index 73b543a51878..ddc66ad23abe 100644
--- a/docs/source/en/api/pipelines/mochi.md
+++ b/docs/source/en/api/pipelines/mochi.md
@@ -115,7 +115,7 @@ export_to_video(frames, "mochi.mp4", fps=30)
 
 ## Reproducing the results from the Genmo Mochi repo
 
-The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the the original implementation, please refer to the following example.
+The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example.
 
 The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder.
 
diff --git a/scripts/convert_consistency_decoder.py b/scripts/convert_consistency_decoder.py
index 0cb5fc50dd60..629c784c095a 100644
--- a/scripts/convert_consistency_decoder.py
+++ b/scripts/convert_consistency_decoder.py
@@ -73,7 +73,7 @@ def _download(url: str, root: str):
                 loop.update(len(buffer))
 
     if insecure_hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
-        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
 
     return download_target
 
diff --git a/src/diffusers/optimization.py b/src/diffusers/optimization.py
index f20bd94edffa..45d2e92a6d41 100644
--- a/src/diffusers/optimization.py
+++ b/src/diffusers/optimization.py
@@ -258,7 +258,7 @@ def get_polynomial_decay_schedule_with_warmup(
 
     lr_init = optimizer.defaults["lr"]
     if not (lr_init > lr_end):
-        raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
+        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
 
     def lr_lambda(current_step: int):
         if current_step < num_warmup_steps:
diff --git a/src/diffusers/pipelines/pag/pag_utils.py b/src/diffusers/pipelines/pag/pag_utils.py
index 7a6e30a3c6be..4cd2fe4cb79f 100644
--- a/src/diffusers/pipelines/pag/pag_utils.py
+++ b/src/diffusers/pipelines/pag/pag_utils.py
@@ -158,7 +158,7 @@ def set_pag_applied_layers(
         ),
     ):
         r"""
-        Set the the self-attention layers to apply PAG. Raise ValueError if the input is invalid.
+        Set the self-attention layers to apply PAG. Raise ValueError if the input is invalid.
 
         Args:
             pag_applied_layers (`str` or `List[str]`):
diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py
index 9e2727b85377..2da782b463d4 100644
--- a/src/diffusers/video_processor.py
+++ b/src/diffusers/video_processor.py
@@ -67,7 +67,7 @@ def preprocess_video(self, video, height: Optional[int] = None, width: Optional[
 
         # ensure the input is a list of videos:
         # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
-        # - if it is is a single video, it is convereted to a list of one video.
+        # - if it is a single video, it is converted to a list of one video.
         if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
             video = list(video)
         elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):