@@ -687,8 +687,33 @@ def __call__(
 
         Args:
             prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                 instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            video (`List[PIL.Image.Image]`, *optional*):
+                The input video or videos to be used as a starting point for the generation. The video should be a list
+                of PIL images, a numpy array, or a torch tensor. Currently, the pipeline only supports generating one
+                video at a time.
+            mask (`List[PIL.Image.Image]`, *optional*):
+                The input mask defines which video regions to condition on and which to generate. Black areas in the
+                mask indicate conditioning regions, while white areas indicate regions for generation. The mask should
+                be a list of PIL images, a numpy array, or a torch tensor. Currently supports generating a single video
+                at a time.
+            reference_images (`List[PIL.Image.Image]`, *optional*):
+                A list of one or more reference images as extra conditioning for the generation. For example, if you
+                are trying to inpaint a video to change the character, you can pass reference images of the new
+                character here. Refer to the Diffusers [examples](https://github.com/huggingface/diffusers/pull/11582)
+                and original [user
+                guide](https://github.com/ali-vilab/VACE/blob/0897c6d055d7d9ea9e191dce763006664d9780f8/UserGuide.md)
+                for a full list of supported tasks and use cases.
+            conditioning_scale (`float`, `List[float]`, `torch.Tensor`, defaults to `1.0`):
+                The conditioning scale to be applied when adding the control conditioning latent stream to the
+                denoising latent stream in each control layer of the model. If a float is provided, it will be applied
+                uniformly to all layers. If a list or tensor is provided, it should have the same length as the number
+                of control layers in the model (`len(transformer.config.vace_layers)`).
             height (`int`, defaults to `480`):
                 The height in pixels of the generated image.
             width (`int`, defaults to `832`):
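The conditioning parameters documented in this hunk can be exercised end to end. The sketch below is illustrative rather than this PR's official example: the checkpoint id `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, the `num_frames` value, and the synthetic black-video/white-mask inputs are assumptions chosen so that every region is regenerated.

```python
import torch
import PIL.Image
from diffusers import WanVACEPipeline
from diffusers.utils import export_to_video

# Assumed checkpoint id for illustration; substitute the VACE checkpoint you use.
pipe = WanVACEPipeline.from_pretrained(
    "Wan-AI/Wan2.1-VACE-1.3B-diffusers", torch_dtype=torch.bfloat16
).to("cuda")

num_frames, height, width = 81, 480, 832  # frame count is an assumed, typical value

# Synthetic stand-ins for a real clip: black frames as the input video and an
# all-white mask. Per the docstring above, black mask areas are kept from
# `video` while white areas are generated, so this mask regenerates everything.
video = [PIL.Image.new("RGB", (width, height)) for _ in range(num_frames)]
mask = [PIL.Image.new("L", (width, height), 255) for _ in range(num_frames)]

frames = pipe(
    prompt="A cat walks through a sunlit garden",
    negative_prompt="blurry, low quality",  # only used when guidance is enabled
    video=video,
    mask=mask,
    conditioning_scale=1.0,  # a float applies uniformly to all control layers
    height=height,
    width=width,
    num_frames=num_frames,
).frames[0]
export_to_video(frames, "output.mp4", fps=16)
```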
@@ -733,8 +758,9 @@ def __call__(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
-                The dtype to use for the torch.amp.autocast.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
 
         Examples:
 
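Since the docstring only states the length constraint on a list-valued `conditioning_scale`, a sketch of the per-layer form may help, combined with the new `max_sequence_length` argument from this hunk. The checkpoint id and the linear 1.0-to-0.5 decay schedule are assumptions for illustration, not recommended settings.

```python
import torch
import PIL.Image
from diffusers import WanVACEPipeline

pipe = WanVACEPipeline.from_pretrained(
    "Wan-AI/Wan2.1-VACE-1.3B-diffusers",  # assumed checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

num_frames, height, width = 81, 480, 832
video = [PIL.Image.new("RGB", (width, height)) for _ in range(num_frames)]
mask = [PIL.Image.new("L", (width, height), 255) for _ in range(num_frames)]

# A list or tensor must have one entry per VACE control layer, i.e. length
# len(transformer.config.vace_layers); a float would be broadcast to all layers.
num_control_layers = len(pipe.transformer.config.vace_layers)
conditioning_scale = torch.linspace(1.0, 0.5, num_control_layers).tolist()

frames = pipe(
    prompt="A cat walks through a sunlit garden",
    video=video,
    mask=mask,
    height=height,
    width=width,
    num_frames=num_frames,
    conditioning_scale=conditioning_scale,
    max_sequence_length=512,  # longer prompts are truncated, shorter ones padded
).frames[0]
```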