
Commit 574a138

Complete diffusers integration (#96)
* Feat: Complete `diffusers` integration
* Update README.md
1 parent 9636fec commit 574a138

File tree

1 file changed: +201 -1 lines changed

README.md

Lines changed: 201 additions & 1 deletion
@@ -45,7 +45,7 @@ The demos above showcase 30-second videos generated using our SkyReels-V2 Diffus
  - [x] Single-GPU & Multi-GPU Inference Code
  - [x] <a href="https://huggingface.co/Skywork/SkyCaptioner-V1">SkyCaptioner-V1</a>: A Video Captioning Model
  - [x] Prompt Enhancer
- - [ ] Diffusers integration
+ - [x] Diffusers integration
  - [ ] Checkpoints of the 5B Models Series
  - [ ] Checkpoints of the Camera Director Models
  - [ ] Checkpoints of the Step & Guidance Distill Model
@@ -216,6 +216,92 @@ python3 generate_video_df.py \
--offload
```

Text-to-video with `diffusers`:
```py
import torch
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video

vae = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32)

pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
    "Skywork/SkyReels-V2-DF-14B-540P-Diffusers",
    vae=vae,
    torch_dtype=torch.bfloat16
)
flow_shift = 8.0  # 8.0 for T2V, 5.0 for I2V
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
pipeline = pipeline.to("cuda")

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."

output = pipeline(
    prompt=prompt,
    num_inference_steps=30,
    height=544,  # 720 for 720P
    width=960,  # 1280 for 720P
    num_frames=97,
    base_num_frames=97,  # 121 for 720P
    ar_step=5,  # Controls asynchronous inference (0 for synchronous mode)
    causal_block_size=5,  # Number of frames in each block for asynchronous processing
    overlap_history=None,  # Number of frames to overlap for smooth transitions in long videos; 17 for long video generations
    addnoise_condition=20,  # Improves consistency in long video generation
).frames[0]
export_to_video(output, "T2V.mp4", fps=24, quality=8)
```

Image-to-video with `diffusers`:
```py
import numpy as np
import torch
import torchvision.transforms.functional as TF
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_image

model_id = "Skywork/SkyReels-V2-DF-14B-720P-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipeline = SkyReelsV2DiffusionForcingImageToVideoPipeline.from_pretrained(
    model_id, vae=vae, torch_dtype=torch.bfloat16
)
flow_shift = 5.0  # 8.0 for T2V, 5.0 for I2V
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
pipeline.to("cuda")

first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")

def aspect_ratio_resize(image, pipeline, max_area=720 * 1280):
    aspect_ratio = image.height / image.width
    mod_value = pipeline.vae_scale_factor_spatial * pipeline.transformer.config.patch_size[1]
    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
    image = image.resize((width, height))
    return image, height, width

def center_crop_resize(image, height, width):
    # Calculate resize ratio to match first frame dimensions
    resize_ratio = max(width / image.width, height / image.height)

    # Resize the image
    width = round(image.width * resize_ratio)
    height = round(image.height * resize_ratio)
    size = [width, height]
    image = TF.center_crop(image, size)

    return image, height, width

first_frame, height, width = aspect_ratio_resize(first_frame, pipeline)
if last_frame.size != first_frame.size:
    last_frame, _, _ = center_crop_resize(last_frame, height, width)

prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."

output = pipeline(
    image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.0
).frames[0]
export_to_video(output, "output.mp4", fps=24, quality=8)
```
> **Note**:
> - To run the **image-to-video (I2V)** task, add `--image ${image_path}` to your command. It is also better to use a **text-to-video (T2V)**-like prompt that includes some description of the first-frame image.
> - For long video generation, simply change `--num_frames`, e.g., `--num_frames 257` for a 10s video, `--num_frames 377` for 15s, `--num_frames 737` for 30s, `--num_frames 1457` for 60s. The number is not strictly aligned with the logical frame count for the specified duration, but it matches certain training parameters, which means it may perform better. When using asynchronous inference with `causal_block_size > 1`, `--num_frames` should be set carefully (see the sketch below for an equivalent `diffusers` setup).
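The same long-video recipe can be expressed with the Diffusion Forcing pipeline from the `diffusers` example above. The following is a minimal sketch, not part of this commit: the checkpoint, scheduler setup, and prompt are reused from the T2V example, while `num_frames=257` (roughly 10s), `overlap_history=17`, and `addnoise_condition=20` simply follow the comments above, so treat the exact combination as an assumption rather than a tested configuration.
```py
# Hedged sketch: long-video T2V with the Diffusion Forcing pipeline shown above.
# Parameter values follow the comments in the T2V example; num_frames=257 mirrors
# the 10s CLI note and is illustrative, not a benchmarked setting.
import torch
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video

vae = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32)
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
    "Skywork/SkyReels-V2-DF-14B-540P-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=8.0)
pipeline = pipeline.to("cuda")

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."

output = pipeline(
    prompt=prompt,
    num_inference_steps=30,
    height=544,
    width=960,
    num_frames=257,         # longer target, roughly 10s at 24 fps, as in the CLI note
    base_num_frames=97,     # 540P base window, same as the example above
    overlap_history=17,     # overlap frames for smooth transitions in long videos
    addnoise_condition=20,  # improves consistency in long video generation
    ar_step=0,              # synchronous mode; set >0 together with causal_block_size for async
).frames[0]
export_to_video(output, "T2V_long.mp4", fps=24, quality=8)
```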
@@ -269,6 +355,35 @@ python3 generate_video_df.py \
> **Note**:
> - When controlling the start and end frames, pass `--image ${image}` to control the generation of the start frame and `--end_image ${end_image}` to control the generation of the end frame.

Video extension with `diffusers`:
```py
import numpy as np
import torch
import torchvision.transforms.functional as TF
from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingVideoToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video, load_video

model_id = "Skywork/SkyReels-V2-DF-14B-540P-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipeline = SkyReelsV2DiffusionForcingVideoToVideoPipeline.from_pretrained(
    model_id, vae=vae, torch_dtype=torch.bfloat16
)
flow_shift = 5.0  # 8.0 for T2V, 5.0 for I2V
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
pipeline.to("cuda")

video = load_video("input_video.mp4")

prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."

output = pipeline(
    video=video, prompt=prompt, height=544, width=960, guidance_scale=5.0,
    num_inference_steps=30, num_frames=257, base_num_frames=97,  # ar_step=5, causal_block_size=5,
).frames[0]
export_to_video(output, "output.mp4", fps=24, quality=8)
# Total frames will be the number of frames of the given video + 257
```

- **Text To Video & Image To Video**

```shell
@@ -291,6 +406,91 @@ python3 generate_video.py \
> - When using an **image-to-video (I2V)** model, you must provide an input image using the `--image ${image_path}` parameter. `--guidance_scale 5.0` and `--shift 3.0` are recommended for the I2V model.
> - Generating a 540P video with the 1.3B model requires approximately 14.7GB of peak VRAM, while the same resolution with the 14B model demands around 43.4GB of peak VRAM (see the offloading sketch after the `diffusers` examples below).

T2V models with `diffusers`:
```py
import torch
from diffusers import (
    SkyReelsV2Pipeline,
    UniPCMultistepScheduler,
    AutoencoderKLWan,
)
from diffusers.utils import export_to_video

# Load the pipeline
# Available models:
# - Skywork/SkyReels-V2-T2V-14B-540P-Diffusers
# - Skywork/SkyReels-V2-T2V-14B-720P-Diffusers
vae = AutoencoderKLWan.from_pretrained(
    "Skywork/SkyReels-V2-T2V-14B-720P-Diffusers",
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = SkyReelsV2Pipeline.from_pretrained(
    "Skywork/SkyReels-V2-T2V-14B-720P-Diffusers",
    vae=vae,
    torch_dtype=torch.bfloat16,
)
flow_shift = 8.0  # 8.0 for T2V, 5.0 for I2V
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe = pipe.to("cuda")

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."

output = pipe(
    prompt=prompt,
    num_inference_steps=50,
    height=544,
    width=960,
    guidance_scale=6.0,  # 6.0 for T2V, 5.0 for I2V
    num_frames=97,
).frames[0]
export_to_video(output, "video.mp4", fps=24, quality=8)
```

I2V models with `diffusers`:
```py
import torch
from diffusers import (
    SkyReelsV2ImageToVideoPipeline,
    UniPCMultistepScheduler,
    AutoencoderKLWan,
)
from diffusers.utils import export_to_video
from PIL import Image

# Load the pipeline
# Available models:
# - Skywork/SkyReels-V2-I2V-1.3B-540P-Diffusers
# - Skywork/SkyReels-V2-I2V-14B-540P-Diffusers
# - Skywork/SkyReels-V2-I2V-14B-720P-Diffusers
vae = AutoencoderKLWan.from_pretrained(
    "Skywork/SkyReels-V2-I2V-14B-720P-Diffusers",
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = SkyReelsV2ImageToVideoPipeline.from_pretrained(
    "Skywork/SkyReels-V2-I2V-14B-720P-Diffusers",
    vae=vae,
    torch_dtype=torch.bfloat16,
)
flow_shift = 5.0  # 8.0 for T2V, 5.0 for I2V
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe = pipe.to("cuda")

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
image = Image.open("path/to/image.png")

output = pipe(
    image=image,
    prompt=prompt,
    num_inference_steps=50,
    height=544,
    width=960,
    guidance_scale=5.0,  # 6.0 for T2V, 5.0 for I2V
    num_frames=97,
).frames[0]
export_to_video(output, "video.mp4", fps=24, quality=8)
```
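If peak VRAM is a concern, the generic `diffusers` CPU offloading helper can be tried instead of moving the whole pipeline to `"cuda"`. The sketch below is not part of this commit; it reuses the I2V setup above and assumes, without measurement, that `enable_model_cpu_offload()` yields a useful saving for these SkyReels-V2 pipelines.
```py
# Hedged sketch: reduce peak VRAM by offloading submodules to CPU between uses.
# enable_model_cpu_offload() is a generic diffusers helper; its benefit for these
# specific SkyReels-V2 pipelines is an assumption, not a benchmarked claim.
import torch
from diffusers import AutoencoderKLWan, SkyReelsV2ImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video
from PIL import Image

model_id = "Skywork/SkyReels-V2-I2V-14B-720P-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = SkyReelsV2ImageToVideoPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)

# Instead of pipe.to("cuda"), keep components on CPU and move each to the GPU only while it runs.
pipe.enable_model_cpu_offload()

image = Image.open("path/to/image.png")
output = pipe(
    image=image,
    prompt="A cat and a dog baking a cake together in a kitchen.",  # shortened version of the prompt above
    num_inference_steps=50,
    height=544,
    width=960,
    guidance_scale=5.0,
    num_frames=97,
).frames[0]
export_to_video(output, "video.mp4", fps=24, quality=8)
```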

- **Prompt Enhancer**
