diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/README.md b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
new file mode 100644
index 00000000000..042b8e109bc
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/README.md
@@ -0,0 +1,57 @@
+# Step-by-Step
+
+This example quantizes the FramePack image-to-video model and validates its accuracy with VBench.
+
+# Prerequisite
+
+## 1. Environment
+
+```shell
+# install zip according to your system
+sudo apt update && sudo apt install zip
+
+pip install -r requirements.txt
+pip install --upgrade neural-compressor-pt
+pip install --upgrade auto-round
+git clone --depth 1 https://github.com/lllyasviel/FramePack.git
+cp -r FramePack/diffusers_helper/ .
+
+# several models will be downloaded automatically into HF_HOME
+export HF_HOME=/path/to/save/model
+```
+
+## 2. Prepare Dataset
+
+```shell
+git clone --depth 1 https://github.com/Vchitect/VBench.git
+cd VBench
+sh vbench2_beta_i2v/download_data.sh
+```
+
+# Run
+
+## BF16
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --topology=BF16 \
+    --dataset_location=/path/to/VBench \
+    --output_video_path=bf16_video \
+    --dimension_list="subject_consistency i2v_background" \
+    --result_path=bf16_result
+```
+
+## MXFP8 or FP8
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+bash run_benchmark.sh \
+    --topology=MXFP8 \
+    --dataset_location=/path/to/VBench \
+    --output_video_path=mxfp8_video \
+    --dimension_list="subject_consistency i2v_background" \
+    --result_path=mxfp8_result
+```
+
+- CUDA_VISIBLE_DEVICES: the dimensions in `--dimension_list` are distributed across the visible GPUs to speed up the evaluation
+- topology: pass `--topology=FP8` instead of `--topology=MXFP8` to quantize with the FP8 scheme
diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/main.py b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
new file mode 100644
index 00000000000..51584f2133e
--- /dev/null
+++ b/examples/pytorch/diffusion_model/diffusers/framepack/main.py
@@ -0,0 +1,347 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+import sys
+
+import einops
+import numpy as np
+import torch
+import torchvision
+
+from auto_round import AutoRound
+from diffusers import AutoencoderKLHunyuanVideo
+from neural_compressor.torch.quantization import (
+    AutoRoundConfig,
+    convert,
+    prepare,
+)
+from PIL import Image
+from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
+from transformers import SiglipImageProcessor, SiglipVisionModel
+
+from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
+from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
+from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
+from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
+from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
+from diffusers_helper.clip_vision import hf_clip_vision_encode
+from diffusers_helper.bucket_tools import find_nearest_bucket
+
+
+parser = argparse.ArgumentParser(
+    description="FramePack quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+parser.add_argument("--scheme", default="MXFP8", type=str, help="quantization scheme.")
+parser.add_argument("--quantize", action="store_true")
+parser.add_argument("--inference", action="store_true")
+parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save the quantized model")
+parser.add_argument("--dataset_location", type=str, help="path of the cloned VBench repository, which contains the images and prompts for evaluation")
+parser.add_argument("--output_video_path", default="./tmp_video", type=str, help="the directory to save generated videos")
+parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation")
+parser.add_argument("--seed", default=31337, type=int, help="random seed")
+parser.add_argument("--total_second_length", default=5, type=int, help="length of the generated video in seconds")
+parser.add_argument("--latent_window_size", default=9, type=int)
+parser.add_argument("--steps", default=25, type=int, help="number of inference steps")
+parser.add_argument("--cfg", default=1.0, type=float, help="real guidance scale")
+parser.add_argument("--gs", default=10.0, type=float, help="distilled guidance scale")
+parser.add_argument("--rs", default=0.0, type=float, help="guidance rescale")
+parser.add_argument("--gpu_memory_preservation", default=6, type=int)
+parser.add_argument("--use_teacache", action="store_true", help="faster speed, but often makes hands and fingers slightly worse")
+parser.add_argument("--mp4_crf", default=16, type=int, help="MP4 compression. Lower means better quality. 0 is uncompressed. 
Change to 16 if you get black outputs.") +parser.add_argument( + "--dimension_list", + nargs="+", + choices=["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "i2v_subject", "i2v_background", "camera_motion"], + help="list of evaluation dimensions, usage: --dimension_list ", +) +parser.add_argument("--ratio", default="16-9", type=str, help="aspect ratio of image") + +args = parser.parse_args() +free_mem_gb = get_cuda_free_memory_gb(gpu) +high_vram = free_mem_gb > 60 + +@torch.no_grad() +def worker(input_image, prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf): + input_image = Image.open(input_image).convert("RGB") + input_image = np.array(input_image) + total_latent_sections = (total_second_length * 30) / (latent_window_size * 4) + total_latent_sections = int(max(round(total_latent_sections), 1)) + + # Clean GPU + if not high_vram: + unload_complete_models( + text_encoder, text_encoder_2, image_encoder, vae, transformer + ) + + # Text encoding + + if not high_vram: + fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode. + load_model_as_complete(text_encoder_2, target_device=gpu) + + llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2) + + llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler) + + llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512) + llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512) + # Processing input image + + H, W, C = input_image.shape + height, width = find_nearest_bucket(H, W, resolution=640) + input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height) + + input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1 + input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None] + + # VAE encoding + + if not high_vram: + load_model_as_complete(vae, target_device=gpu) + + start_latent = vae_encode(input_image_pt, vae) + + # CLIP Vision + + if not high_vram: + load_model_as_complete(image_encoder, target_device=gpu) + + image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder) + image_encoder_last_hidden_state = image_encoder_output.last_hidden_state + + # Dtype + + llama_vec = llama_vec.to(transformer.dtype) + llama_vec_n = llama_vec_n.to(transformer.dtype) + clip_l_pooler = clip_l_pooler.to(transformer.dtype) + clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype) + image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype) + + # Sampling + + rnd = torch.Generator("cpu").manual_seed(seed) + num_frames = latent_window_size * 4 - 3 + + history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu() + history_pixels = None + total_generated_latent_frames = 0 + + latent_paddings = reversed(range(total_latent_sections)) + + if total_latent_sections > 4: + # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some + # items looks better than expanding it when total_latent_sections > 4 + # One can try to remove below trick and just + # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare + latent_paddings = 
[3] + [2] * (total_latent_sections - 3) + [1, 0] + + for latent_padding in latent_paddings: + is_last_section = latent_padding == 0 + latent_padding_size = latent_padding * latent_window_size + + print(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}") + + indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0) + clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1) + clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1) + + clean_latents_pre = start_latent.to(history_latents) + clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2) + clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) + + if not high_vram: + unload_complete_models() + move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation) + + + if use_teacache: + transformer.initialize_teacache(enable_teacache=True, num_steps=steps) + else: + transformer.initialize_teacache(enable_teacache=False) + + def callback(d): + preview = d["denoised"] + preview = vae_decode_fake(preview) + + preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8) + preview = einops.rearrange(preview, "b c t h w -> (b h) (t w) c") + + current_step = d["i"] + 1 + hint = f"Sampling {current_step}/{steps}" + desc = f"Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ..." 
+ print(hint, desc) + return + + generated_latents = sample_hunyuan( + transformer=transformer, + sampler="unipc", + width=width, + height=height, + frames=num_frames, + real_guidance_scale=cfg, + distilled_guidance_scale=gs, + guidance_rescale=rs, + # shift=3.0, + num_inference_steps=steps, + generator=rnd, + prompt_embeds=llama_vec, + prompt_embeds_mask=llama_attention_mask, + prompt_poolers=clip_l_pooler, + negative_prompt_embeds=llama_vec_n, + negative_prompt_embeds_mask=llama_attention_mask_n, + negative_prompt_poolers=clip_l_pooler_n, + device=gpu, + dtype=torch.bfloat16, + image_embeddings=image_encoder_last_hidden_state, + latent_indices=latent_indices, + clean_latents=clean_latents, + clean_latent_indices=clean_latent_indices, + clean_latents_2x=clean_latents_2x, + clean_latent_2x_indices=clean_latent_2x_indices, + clean_latents_4x=clean_latents_4x, + clean_latent_4x_indices=clean_latent_4x_indices, + callback=callback, + ) + if is_last_section: + generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2) + + total_generated_latent_frames += int(generated_latents.shape[2]) + history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) + + if not high_vram: + offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8) + load_model_as_complete(vae, target_device=gpu) + + real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] + + if history_pixels is None: + history_pixels = vae_decode(real_history_latents, vae).cpu() + else: + section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2) + overlapped_frames = latent_window_size * 4 - 3 + + current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu() + history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames) + + if not high_vram: + unload_complete_models() + + print(f"Decoded. 
Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}") + + if is_last_section: + break + return history_pixels + +if __name__ == "__main__": + transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained("lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu() + transformer.to(dtype=torch.bfloat16) + transformer.requires_grad_(False) + transformer.eval() + + if args.quantize: + setattr(transformer, "name_or_path", "lllyasviel/FramePackI2V_HY") + + qconfig = AutoRoundConfig( + scheme=args.scheme, + iters=0, + export_format="fake", + output_dir=args.output_dir, + ) + transformer = prepare(transformer, qconfig) + transformer = convert(transformer, qconfig) + + if args.inference: + text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder", torch_dtype=torch.float16).cpu() + text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="text_encoder_2", torch_dtype=torch.float16).cpu() + tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer") + tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2") + vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16).cpu() + + feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="feature_extractor") + image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16).cpu() + + vae.eval() + text_encoder.eval() + text_encoder_2.eval() + image_encoder.eval() + + if not high_vram: + vae.enable_slicing() + vae.enable_tiling() + + transformer.high_quality_fp32_output_for_inference = True + print("transformer.high_quality_fp32_output_for_inference = True") + + vae.to(dtype=torch.float16) + image_encoder.to(dtype=torch.float16) + text_encoder.to(dtype=torch.float16) + text_encoder_2.to(dtype=torch.float16) + + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + text_encoder_2.requires_grad_(False) + image_encoder.requires_grad_(False) + + if not high_vram: + # DynamicSwapInstaller is same as huggingface"s enable_sequential_offload but 3x faster + DynamicSwapInstaller.install_model(text_encoder, device=gpu) + DynamicSwapInstaller.install_model(transformer, device=gpu) + else: + text_encoder.to(gpu) + text_encoder_2.to(gpu) + image_encoder.to(gpu) + vae.to(gpu) + transformer.to(gpu) + + if not os.path.exists(args.output_video_path): + os.makedirs(args.output_video_path) + + idx = 0 + for dimension in args.dimension_list: + # prepare inputs + + image_folder = os.path.join(args.dataset_location, f"vbench2_beta_i2v/data/crop/{args.ratio}") + info_list = json.load(open(os.path.join(args.dataset_location, "vbench2_beta_i2v/vbench2_i2v_full_info.json"), "r")) + inputs = [(os.path.join(image_folder, info["image_name"]), info["prompt_en"]) for info in info_list if dimension in info["dimension"]] + for image_path, prompt in inputs: + idx += 1 + if args.limit > 0 and idx >= args.limit: + break + + # only sample 1 video for each prompt to evaluate quickly + cur_save_path = f"{args.output_video_path}/{prompt}-0.mp4" + + if os.path.exists(cur_save_path): + continue + # perform sampling + x = worker(image_path, prompt, args.seed, args.total_second_length, args.latent_window_size, args.steps, args.cfg, args.gs, args.rs, args.gpu_memory_preservation, 
args.use_teacache, args.mp4_crf) + b, c, t, h, w = x.shape + + per_row = b + for p in [6, 5, 4, 3, 2]: + if b % p == 0: + per_row = p + break + + x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5 + x = x.detach().cpu().to(torch.uint8) + video = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row) + torchvision.io.write_video(cur_save_path, video, fps=30, video_codec="h264", options={"crf": "10"}) diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt new file mode 100644 index 00000000000..06e80a0af7f --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/requirements.txt @@ -0,0 +1,35 @@ +Pillow +matplotlib +timm>=0.9,<=1.0.12 +wheel +cython +tensorboard +scipy +scikit-learn +scikit-image +openai-clip +decord +requests +pyyaml +easydict +pyiqa +lvis +fairscale>=0.4.4 +fvcore +easydict +urllib3 +boto3 +omegaconf +transformers +pycocoevalcap +detectron2@git+https://github.com/facebookresearch/detectron2.git +accelerate +diffusers +sentencepiece==0.2.0 +av==12.1.0 +torchsde==0.2.6 +einops +safetensors +opencv-python-headless +dreamsim +numpy<2.0.0 diff --git a/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh new file mode 100644 index 00000000000..71dc61b38d9 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/framepack/run_benchmark.sh @@ -0,0 +1,130 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --ratio=*) + ratio=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --limit=*) + limit=$(echo $var |cut -f2 -d=) + ;; + --output_video_path=*) + output_video_path=$(echo $var |cut -f2 -d=) + ;; + --result_path=*) + result_path=$(echo $var |cut -f2 -d=) + ;; + --dimension_list=*) + dimension_list=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + limit=${limit:=-1} + ratio=${ratio:="16-9"} + output_video_path=${output_video_path:="./tmp_videos"} + result_path=${result_path:="./eval_result"} + + if [[ ! "${result_path}" = /* ]]; then + result_path=$(realpath -s "$(pwd)/$result_path") + fi + + if [[ ! 
"${output_video_path}" = /* ]]; then + output_video_path=$(realpath -s "$(pwd)/$output_video_path") + fi + + if [ "${topology}" = "FP8" ]; then + extra_cmd="--scheme FP8 --quantize --inference" + elif [ "${topology}" = "MXFP8" ]; then + extra_cmd="--scheme MXFP8 --quantize --inference" + elif [ "${topology}" = "BF16" ]; then + extra_cmd="--inference" + fi + + if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + gpu_list="${CUDA_VISIBLE_DEVICES:-}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" + + IFS=' ' read -ra dimensions <<< "$dimension_list" + dimension_num=${#dimensions[@]} + if [ "${visible_gpus}" -gt "${dimension_num}" ]; then + count=${dimension_num} + step=1 + else + count=${visible_gpus} + step=$((dimension_num/visible_gpus)) + left=${dimensions[@]:step*count-1:dimension_num} + dimensions=("${dimensions[@]:0:step*count-1}" "$left") + fi + + for ((i=0; i&1) + result_file=$(echo "$output" | grep -i "Evaluation results saved to " | awk '{print $NF}') + + echo "Evaluation results saved to ${result_file}" + zip -r "${result_path}.zip" ${result_path} + python scripts/cal_i2v_final_score.py --zip_file "${result_path}.zip" --model_name "framepack" + +} + +main "$@" + diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 5fa3b253cfa..2342f9f5b84 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -268,7 +268,6 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): if tokenizer is not None: delattr(model.orig_model, "tokenizer") elif pipe is None: - tokenizer = "Placeholder" self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) model = model.orig_model if pipe is not None: