def compute_density_for_timestep_sampling(
    weighting_scheme: str,
    batch_size: int,
    logit_mean: Optional[float] = None,
    logit_std: Optional[float] = None,
    mode_scale: Optional[float] = None,
    device: Union[torch.device, str] = "cpu",
    generator: Optional[torch.Generator] = None,
) -> torch.Tensor:
    """
    Compute the density for sampling the timesteps when doing SD3 training.

    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.

    Args:
        weighting_scheme (`str`):
            `"logit_normal"` draws from a normal distribution and squashes the draw with a sigmoid; `"mode"` draws
            uniformly and reshapes the draw with `mode_scale`; any other value returns plain uniform draws.
        batch_size (`int`):
            Number of values to draw; the result is a 1-D tensor of this length.
        logit_mean (`float`, *optional*):
            Mean of the normal distribution. Required when `weighting_scheme == "logit_normal"`.
        logit_std (`float`, *optional*):
            Standard deviation of the normal distribution. Required when `weighting_scheme == "logit_normal"`.
        mode_scale (`float`, *optional*):
            Scale of the cosine correction term. Required when `weighting_scheme == "mode"`.
        device (`torch.device` or `str`, defaults to `"cpu"`):
            Device on which the random draw is created.
        generator (`torch.Generator`, *optional*):
            Random number generator forwarded to the sampling call, for reproducible draws.
            NOTE(review): PyTorch is expected to require the generator to live on `device` -- confirm for
            non-CPU use.

    Returns:
        `torch.Tensor`: Tensor of shape `(batch_size,)` created on `device`.

    Raises:
        TypeError: If a parameter required by the selected `weighting_scheme` is `None`.
    """
    if weighting_scheme == "logit_normal":
        # Fail early with a clear message instead of an opaque error from inside `torch.normal`.
        if logit_mean is None or logit_std is None:
            raise TypeError("`logit_mean` and `logit_std` must be provided when `weighting_scheme='logit_normal'`.")
        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
        u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device=device, generator=generator)
        u = torch.sigmoid(u)
    elif weighting_scheme == "mode":
        # Fail early with a clear message instead of `None * Tensor` raising deep in the expression below.
        if mode_scale is None:
            raise TypeError("`mode_scale` must be provided when `weighting_scheme='mode'`.")
        u = torch.rand(size=(batch_size,), device=device, generator=generator)
        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
    else:
        # Every other scheme name falls back to uniform draws.
        u = torch.rand(size=(batch_size,), device=device, generator=generator)
    return u