"""
This is a parallel inference script for CogVideoX. The original script
can be found in the xDiT project at

https://github.com/xdit-project/xDiT/blob/main/examples/cogvideox_example.py

This code parallelizes the inference process across multiple GPUs,
and thus speeds it up.

Usage:
1. pip install xfuser
2. mkdir results
3. Run the following command to generate a video:
torchrun --nproc_per_node=4 parallel_inference_xdit.py \
    --model <cogvideox-model-path> --ulysses_degree 1 --ring_degree 2 \
    --use_cfg_parallel --height 480 --width 720 --num_frames 9 \
    --prompt 'A small dog.'

You can also use the run.sh file in the same folder to automate running this
code for batch generation of videos:

sh ./run.sh
"""

import time

import torch
from diffusers.utils import export_to_video
from xfuser import xFuserCogVideoXPipeline, xFuserArgs
from xfuser.config import FlexibleArgumentParser
from xfuser.core.distributed import (
    get_world_group,
    get_runtime_state,
    is_dp_last_group,
)

| 41 | + |
| 42 | +def main(): |
| 43 | + parser = FlexibleArgumentParser(description="xFuser Arguments") |
| 44 | + args = xFuserArgs.add_cli_args(parser).parse_args() |
| 45 | + engine_args = xFuserArgs.from_cli_args(args) |
| 46 | + |
| 47 | + # Check if ulysses_degree is valid |
| 48 | + num_heads = 30 |
| 49 | + if engine_args.ulysses_degree > 0 and num_heads % engine_args.ulysses_degree != 0: |
| 50 | + raise ValueError( |
| 51 | + f"ulysses_degree ({engine_args.ulysses_degree}) must be a divisor of the number of heads ({num_heads})" |
| 52 | + ) |

    engine_config, input_config = engine_args.create_config()
    local_rank = get_world_group().local_rank

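    # Load the model into xFuser's parallel CogVideoX pipeline wrapper.
    # bfloat16 roughly halves memory use relative to fp32, which matters
    # when each rank holds its own copy of the weights.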
    pipe = xFuserCogVideoXPipeline.from_pretrained(
        pretrained_model_name_or_path=engine_config.model_config.model,
        engine_config=engine_config,
        torch_dtype=torch.bfloat16,
    )
    if args.enable_sequential_cpu_offload:
        # Offloading keeps most weights on the CPU and streams them to the GPU
        # as needed; VAE tiling further reduces peak memory during decoding.
        pipe.enable_model_cpu_offload(gpu_id=local_rank)
        pipe.vae.enable_tiling()
    else:
        device = torch.device(f"cuda:{local_rank}")
        pipe = pipe.to(device)

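    # Benchmark the generation: reset the CUDA peak-memory counter and time the run.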
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

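    # Every rank participates in the denoising loop; xFuser splits the work
    # across GPUs according to the configured parallel degrees.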
    output = pipe(
        height=input_config.height,
        width=input_config.width,
        num_frames=input_config.num_frames,
        prompt=input_config.prompt,
        num_inference_steps=input_config.num_inference_steps,
        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
        guidance_scale=6,
    ).frames[0]

    end_time = time.time()
    elapsed_time = end_time - start_time
    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")

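    # Encode the parallel configuration in the filename so runs with different
    # degree settings don't overwrite each other's results.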
    parallel_info = (
        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
        f"tp{engine_args.tensor_parallel_degree}_"
        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
    )
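    # Only the last data-parallel group writes the video, so the other ranks
    # don't produce duplicate files.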
    if is_dp_last_group():
        resolution = f"{input_config.width}x{input_config.height}"
        output_filename = f"results/cogvideox_{parallel_info}_{resolution}.mp4"
        export_to_video(output, output_filename, fps=8)
        print(f"output saved to {output_filename}")

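    # Let the last global rank report the benchmark numbers, then tear down
    # the distributed environment.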
    if get_world_group().rank == get_world_group().world_size - 1:
        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9:.2f} GB")
    # "destory" is the method name as spelled in xfuser's API.
    get_runtime_state().destory_distributed_env()


if __name__ == "__main__":
    main()