class SaveActivationHook:
    """Forward-pre-hook that records the per-channel absolute maximum of a layer's input.

    An instance is meant to be passed to ``register_forward_pre_hook``; it is then
    called as ``hook(layer, inputs)`` before every forward pass of that layer.
    Only the channel-wise abs-max (a tensor of shape ``[C]``) is kept per call,
    which keeps the stored calibration data small.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Handle returned by the hook registration; set by whoever registers the hook.
        self.hook_handle = None
        # One ``[C]`` tensor per recorded forward call, in call order.
        self.outputs = []

    def __call__(self, layer, inputs):
        """Record the channel-wise abs-max of the first positional input.

        Args:
            layer: The layer the hook is attached to (unused).
            inputs: The positional inputs of the layer, normally a tuple whose
                first element is the activation tensor. A bare tensor is also
                accepted. The tensor may be ``[BS, C]`` or ``[BS, N_token, C]``
                (any rank works: all leading dims are flattened).

        Returns:
            None. Nothing is returned, so the hook never replaces the layer's inputs
            (NOTE(review): relies on Paddle treating a ``None`` return as "inputs
            unchanged" -- confirm against the Paddle version in use).
        """
        if isinstance(inputs, (tuple, list)):
            # A layer invoked without positional arguments yields an empty
            # container; skip it like a ``None`` input instead of raising
            # IndexError in the middle of a sampling run.
            if not inputs:
                return
            x = inputs[0]
        else:
            x = inputs

        # Nothing to record for a missing input.
        if x is None:
            return

        # The last dimension is the channel dimension.
        num_channels = x.shape[-1]

        # Flatten every leading dimension, then take the abs-max per channel -> [C].
        channel_absmax = paddle.abs(x.reshape([-1, num_channels])).max(axis=0)

        self.outputs.append(channel_absmax)

    def clear(self):
        """Drop everything recorded so far (the hook registration is untouched)."""
        self.outputs = []
= [] + with open(prompt_path, 'r') as f: + lines = f.readlines() + for line in lines: + prompts.append(line.strip()) + + N_batch = len(prompts) // args.batch_size # drop_last + for i in range(N_batch): + images = pipe( + prompt=prompts[i*args.batch_size: (i+1)*args.batch_size], + num_inference_steps=args.num_sampling_steps + ).images + + save_d = {} + for k, v in hook_d.items(): + # 如果没有采集到任何 activation,跳过并记录警告 + if not getattr(v, "outputs", None): + logger.warning(f'layer_name: {k} has no saved outputs, skipping.') + continue + + # 将 list of paddle.Tensor ([C]) -> stacked Tensor shape [N_timestep*B, C] + save_d[k] = paddle.stack(v.outputs, axis=0) + + # logging: v.outputs[0].shape 在 Paddle 中是 tuple,格式化打印也没问题 + logger.info(f'layer_name: {k}, hook_input_shape: {v.outputs[0].shape}') + + # 安全移除 hook(兼容不同 Paddle 版本) + handle = getattr(v, "hook_handle", None) + if handle is not None: + try: + # 新版本可能提供可调用的 handle.remove() + handle.remove() + except Exception: + # 退而求其次:尝试从 module 的私有 hook dict 中移除(如果 hook 保存了 module 引用) + try: + module = getattr(v, "module", None) + if module is not None: + # 可能是 pre 或 post 钩子 id + if hasattr(module, "_forward_pre_hooks"): + module._forward_pre_hooks.pop(handle, None) + if hasattr(module, "_forward_post_hooks"): + module._forward_post_hooks.pop(handle, None) + else: + # 无 module 引用时无法进一步移除(记录警告) + logger.warning(f"hook handle for {k} could not be removed automatically (no module reference).") + except Exception as e: + logger.warning(f"failed to remove hook for {k}: {e}") + else: + logger.warning(f'no hook_handle found for {k}') + + # 保存到文件(Paddle 的保存格式) + save_path = os.path.join(args.log, quant_config.calib_data.save_path) + paddle.save(save_d, save_path) + logger.info(f'saved calib data in {save_path}') + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--log", type=str) + parser.add_argument("--cfg-scale", type=float, default=4.0) + parser.add_argument('--quant-config', required=True, 
#!/usr/bin/env bash
# Launcher for the PixArt quantization example.
#
# Every setting below can be overridden from the environment, e.g.
#   GPU_ID=0 CFG=w8a8.yaml ./main.sh
# The defaults are the values that used to be hard-coded here.
#
# The script names suggest the order calibration -> PTQ -> quantized inference
# (TODO confirm); only the last step is enabled by default. Uncomment the
# earlier steps for a first run.

# Stop at the first failing step / unset variable instead of running later
# steps on missing outputs.
set -euo pipefail

LOG="${LOG:-fp16_1024}"                        # sub-directory of ./logs used by every step
CFG="${CFG:-w8a8.yaml}"                        # quant config file inside ./configs
PROMPT_PATH="${PROMPT_PATH:-./samples_16.txt}" # prompt list used by the calibration step
GPU_ID="${GPU_ID:-2}"                          # device index exposed through CUDA_VISIBLE_DEVICES

# CUDA_VISIBLE_DEVICES="$GPU_ID" python get_calib_data.py --quant-config "./configs/${CFG}" --log "./logs/${LOG}" --prompt "$PROMPT_PATH"

# CUDA_VISIBLE_DEVICES="$GPU_ID" python ptq.py --quant-config "./configs/${CFG}" --log "./logs/${LOG}"
CUDA_VISIBLE_DEVICES="$GPU_ID" python quant_inference.py --quant-config "./configs/${CFG}" --log "./logs/${LOG}"
+# See the License for the specific language governing permissions and +# limitations under the License. + +import html +import inspect +import re +import urllib.parse as ul +from typing import Callable, List, Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F + +from ppdiffusers.transformers import T5EncoderModel, T5Tokenizer + +from ppdiffusers.image_processor import VaeImageProcessor +from ppdiffusers.models import AutoencoderKL, Transformer2DModel +from ppdiffusers.schedulers import DPMSolverMultistepScheduler +from ppdiffusers.utils import ( + BACKENDS_MAPPING, + deprecate, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ppdiffusers.utils.paddle_utils import randn_tensor +from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from models.customize_transformer_2d import CustomizeTransformer2DModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import paddle + >>> from ppdiffusers import PixArtAlphaPipeline + + >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too. + >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", paddle_dtype=paddle.float16) + + >>> prompt = "A small cactus with a happy face in the Sahara desert." 
+ >>> image = pipe(prompt).images[0] + ``` +""" + +ASPECT_RATIO_1024_BIN = { + "0.25": [512.0, 2048.0], + "0.28": [512.0, 1856.0], + "0.32": [576.0, 1792.0], + "0.33": [576.0, 1728.0], + "0.35": [576.0, 1664.0], + "0.4": [640.0, 1600.0], + "0.42": [640.0, 1536.0], + "0.48": [704.0, 1472.0], + "0.5": [704.0, 1408.0], + "0.52": [704.0, 1344.0], + "0.57": [768.0, 1344.0], + "0.6": [768.0, 1280.0], + "0.68": [832.0, 1216.0], + "0.72": [832.0, 1152.0], + "0.78": [896.0, 1152.0], + "0.82": [896.0, 1088.0], + "0.88": [960.0, 1088.0], + "0.94": [960.0, 1024.0], + "1.0": [1024.0, 1024.0], + "1.07": [1024.0, 960.0], + "1.13": [1088.0, 960.0], + "1.21": [1088.0, 896.0], + "1.29": [1152.0, 896.0], + "1.38": [1152.0, 832.0], + "1.46": [1216.0, 832.0], + "1.67": [1280.0, 768.0], + "1.75": [1344.0, 768.0], + "2.0": [1408.0, 704.0], + "2.09": [1472.0, 704.0], + "2.4": [1536.0, 640.0], + "2.5": [1600.0, 640.0], + "3.0": [1728.0, 576.0], + "4.0": [2048.0, 512.0], +} + +ASPECT_RATIO_512_BIN = { + "0.25": [256.0, 1024.0], + "0.28": [256.0, 928.0], + "0.32": [288.0, 896.0], + "0.33": [288.0, 864.0], + "0.35": [288.0, 832.0], + "0.4": [320.0, 800.0], + "0.42": [320.0, 768.0], + "0.48": [352.0, 736.0], + "0.5": [352.0, 704.0], + "0.52": [352.0, 672.0], + "0.57": [384.0, 672.0], + "0.6": [384.0, 640.0], + "0.68": [416.0, 608.0], + "0.72": [416.0, 576.0], + "0.78": [448.0, 576.0], + "0.82": [448.0, 544.0], + "0.88": [480.0, 544.0], + "0.94": [480.0, 512.0], + "1.0": [512.0, 512.0], + "1.07": [512.0, 480.0], + "1.13": [544.0, 480.0], + "1.21": [544.0, 448.0], + "1.29": [576.0, 448.0], + "1.38": [576.0, 416.0], + "1.46": [608.0, 416.0], + "1.67": [640.0, 384.0], + "1.75": [672.0, 384.0], + "2.0": [704.0, 352.0], + "2.09": [736.0, 352.0], + "2.4": [768.0, 320.0], + "2.5": [800.0, 320.0], + "3.0": [864.0, 288.0], + "4.0": [1024.0, 256.0], +} + + +class CustomizePixArtAlphaPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using PixArt-Alpha. 
+ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. PixArt-Alpha uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`Transformer2DModel`]): + A text conditioned `Transformer2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
+ """ + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder"] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKL, + transformer: CustomizeTransformer2DModel, + scheduler: DPMSolverMultistepScheduler, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def convert_quant(self, quant_config): + + self.quant_config = quant_config + self.transformer.convert_quant(self.quant_config) + + # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py + def mask_text_embeddings(self, emb, mask): + if emb.shape[0] == 1: + keep_index = mask.sum().item() + return emb[:, :, :keep_index, :], keep_index + else: + masked_feature = emb * mask[:, None, :, None] + return masked_feature, emb.shape[2] + + # Adapted from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + negative_prompt: str = "", + num_images_per_prompt: int = 1, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + prompt_attention_mask: Optional[paddle.Tensor] = None, + negative_prompt_attention_mask: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + **kwargs, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` + instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For + PixArt-Alpha, this should be "". + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" + string. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + + if "mask_feature" in kwargs: + deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." + deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # See Section 3.1. of the paper. 
+ max_length = 120 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pd", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + prompt_attention_mask = text_inputs.attention_mask + + prompt_embeds = self.text_encoder(text_input_ids, attention_mask=prompt_attention_mask) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.transformer is not None: + dtype = self.transformer.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.cast(dtype=dtype) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_attention_mask = prompt_attention_mask.reshape([bs_embed, -1]) + prompt_attention_mask = prompt_attention_mask.tile([num_images_per_prompt, 1]) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens = [negative_prompt] * batch_size + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + 
uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pd", + ) + negative_prompt_attention_mask = uncond_input.attention_mask + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids, attention_mask=negative_prompt_attention_mask + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=dtype) + + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + + negative_prompt_attention_mask = negative_prompt_attention_mask.reshape([bs_embed, -1]) + negative_prompt_attention_mask = negative_prompt_attention_mask.tile([num_images_per_prompt, 1]) + else: + negative_prompt_embeds = None + negative_prompt_attention_mask = None + + return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_steps, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_attention_mask=None, + negative_prompt_attention_mask=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_attention_mask is None: + raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.") + + if negative_prompt_embeds is not None and negative_prompt_attention_mask is None: + raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.") + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + if prompt_attention_mask.shape != negative_prompt_attention_mask.shape: + raise ValueError( + "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but" + f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`" + f" {negative_prompt_attention_mask.shape}." 
+ ) + + # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = 
re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . 
" + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, dtype=dtype) + else: + latents = latents.cast(dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @staticmethod + def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]: + """Returns binned height and width.""" + ar = float(height / width) + closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar)) + default_hw = ratios[closest_ratio] + return int(default_hw[0]), int(default_hw[1]) + + @staticmethod + def resize_and_crop_tensor(samples: paddle.Tensor, new_width: int, new_height: int) -> paddle.Tensor: + orig_height, orig_width = samples.shape[2], samples.shape[3] + + # Check if resizing is needed + if orig_height != new_height or orig_width != new_width: + ratio = max(new_height / orig_height, new_width / orig_width) + resized_width = int(orig_width * ratio) + resized_height = int(orig_height * ratio) + + # Resize + samples = F.interpolate( + samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + # Center Crop + start_x = (resized_width - new_width) // 2 + end_x = start_x + new_width + start_y = (resized_height - new_height) // 2 + end_y = start_y + new_height + samples = samples[:, :, start_y:end_y, start_x:end_x] + + return samples + + @paddle.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + negative_prompt: str = "", + num_inference_steps: int = 20, + timesteps: List[int] = None, + guidance_scale: float = 4.5, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = 
None, + prompt_embeds: Optional[paddle.Tensor] = None, + prompt_attention_mask: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_attention_mask: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + use_resolution_binning: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 4.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): + One or a list of [paddle generator(s)] to make generation deterministic. + latents (`paddle.Tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + prompt_attention_mask (`paddle.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not + provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_attention_mask (`paddle.Tensor`, *optional*): + Pre-generated attention mask for negative text embeddings. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + use_resolution_binning (`bool` defaults to `True`): + If set to `True`, the requested height and width are first mapped to the closest resolutions using + `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to + the requested resolution. Useful for generating non-square images. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + if "mask_feature" in kwargs: + deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." + deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) + # 1. Check inputs.
Raise error if not correct + height = height or self.transformer.config.sample_size * self.vae_scale_factor + width = width or self.transformer.config.sample_size * self.vae_scale_factor + if use_resolution_binning: + aspect_ratio_bin = ( + ASPECT_RATIO_1024_BIN if self.transformer.config.sample_size == 128 else ASPECT_RATIO_512_BIN + ) + orig_height, orig_width = height, width + height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin) + + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_steps, + prompt_embeds, + negative_prompt_embeds, + prompt_attention_mask, + negative_prompt_attention_mask, + ) + + # 2. Default height and width to transformer + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + ( + prompt_embeds, + prompt_attention_mask, + negative_prompt_embeds, + negative_prompt_attention_mask, + ) = self.encode_prompt( + prompt, + do_classifier_free_guidance, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_attention_mask=prompt_attention_mask, + negative_prompt_attention_mask=negative_prompt_attention_mask, + clean_caption=clean_caption, + ) + if do_classifier_free_guidance: + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds], axis=0) + prompt_attention_mask = paddle.concat([negative_prompt_attention_mask, prompt_attention_mask], axis=0) + + # 4. 
Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents. + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + latent_channels, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Prepare micro-conditions. + added_cond_kwargs = {"resolution": None, "aspect_ratio": None} + if self.transformer.config.sample_size == 128: + resolution = paddle.to_tensor([height, width]).tile([batch_size * num_images_per_prompt, 1]) + aspect_ratio = paddle.to_tensor([float(height / width)]).tile([batch_size * num_images_per_prompt, 1]) + resolution = resolution.cast(dtype=prompt_embeds.dtype) + aspect_ratio = aspect_ratio.cast(dtype=prompt_embeds.dtype) + added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio} + + # 7. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + current_timestep = t + if not paddle.is_tensor(current_timestep): + if isinstance(current_timestep, float): + dtype = paddle.float32 + else: + dtype = paddle.int64 + current_timestep = paddle.to_tensor([current_timestep], dtype=dtype) + elif len(current_timestep.shape) == 0: + current_timestep = current_timestep[None] + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + current_timestep = current_timestep.expand( + [ + latent_model_input.shape[0], + ] + ) + + # predict noise model_output + noise_pred = self.transformer( + latent_model_input, + encoder_hidden_states=prompt_embeds, + encoder_attention_mask=prompt_attention_mask, + timestep=current_timestep, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # learned sigma + if self.transformer.config.out_channels // 2 == latent_channels: + noise_pred = noise_pred.chunk(2, axis=1)[0] + else: + noise_pred = noise_pred + + # compute previous image: x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == 
"latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + if use_resolution_binning: + image = self.resize_and_crop_tensor(image, orig_width, orig_height) + else: + image = latents + + if not output_type == "latent": + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/examples/pixart_quant/example/pixart/models/customize_transformer_2d.py b/ppdiffusers/examples/pixart_quant/example/pixart/models/customize_transformer_2d.py new file mode 100644 index 000000000..adbc80c38 --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/example/pixart/models/customize_transformer_2d.py @@ -0,0 +1,596 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute + +from ppdiffusers.configuration_utils import ConfigMixin, register_to_config +from ppdiffusers.models.embeddings import ImagePositionalEmbeddings +from ppdiffusers.utils import ( + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + recompute_use_reentrant, + use_old_recompute, +) +from ppdiffusers.models.attention import BasicTransformerBlock +from ppdiffusers.models.embeddings import CaptionProjection, PatchEmbed +from ppdiffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear +from ppdiffusers.models.modeling_utils import ModelMixin +from ppdiffusers.models.normalization import AdaLayerNormSingle +from ppdiffusers.models.simplified_facebook_dit import SimplifiedFacebookDIT + +from qdiff.base.base_quantizer import StaticQuantizer, DynamicQuantizer, BaseQuantizer +from qdiff.base.quant_layer import QuantizedLinear +from qdiff.utils import apply_func_to_submodules +from qdiff.base.quant_model import quant_layer_refactor_, bitwidth_refactor_, load_quant_param_dict_, save_quant_param_dict_, set_init_done_ + + +@dataclass +class Transformer2DModelOutput(BaseOutput): + """ + The output of [`Transformer2DModel`]. + + Args: + sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability + distributions for the unnoised latent pixels. + """ + + sample: paddle.Tensor + + +class CustomizeTransformer2DModel(ModelMixin, ConfigMixin): + """ + A 2D Transformer model for image-like data. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. 
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + The number of channels in the input and output (specify if the input is **continuous**). + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. + num_vector_embeds (`int`, *optional*): + The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**). + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): + The number of diffusion steps used during training. Pass if at least one of the norm_layers is + `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are + added to the hidden states. + + During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the `TransformerBlocks` attention should contain a bias parameter. 
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + attention_type: str = "default", + caption_channels: int = None, + data_format: str = "NCHW", + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + self.inner_dim = inner_dim = num_attention_heads * attention_head_dim + self.data_format = data_format + + self.inference_optimize = os.getenv("INFERENCE_OPTIMIZE") == "True" + + conv_cls = nn.Conv2D if USE_PEFT_BACKEND else LoRACompatibleConv + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + + # 1. 
Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # Define whether input is continuous or discrete depending on configuration + self.is_input_continuous = (in_channels is not None) and (patch_size is None) + self.is_input_vectorized = num_vector_embeds is not None + self.is_input_patches = in_channels is not None and patch_size is not None + + if norm_type == "layer_norm" and num_embeds_ada_norm is not None: + deprecation_message = ( + f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or" + " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config." + " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect" + " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it" + " would be very nice if you could open a Pull request for the `transformer/config.json` file" + ) + deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False) + norm_type = "ada_norm" + + if self.is_input_continuous and self.is_input_vectorized: + raise ValueError( + f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is None." + ) + elif self.is_input_vectorized and self.is_input_patches: + raise ValueError( + f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" + " sure that either `num_vector_embeds` or `num_patches` is None." + ) + elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: + raise ValueError( + f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" + f" {patch_size}. 
Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." + ) + + # 2. Define input layers + if self.is_input_continuous: + self.in_channels = in_channels + + self.norm = nn.GroupNorm( + num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6, data_format=data_format + ) + if use_linear_projection: + self.proj_in = linear_cls(in_channels, inner_dim) + else: + self.proj_in = conv_cls( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0, data_format=data_format + ) + elif self.is_input_vectorized: + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" + + self.height = sample_size + self.width = sample_size + self.num_vector_embeds = num_vector_embeds + self.num_latent_pixels = self.height * self.width + + self.latent_image_embedding = ImagePositionalEmbeddings( + num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width + ) + elif self.is_input_patches: + assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" + + self.height = sample_size + self.width = sample_size + + self.patch_size = patch_size + interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1 + interpolation_scale = max(interpolation_scale, 1) + self.pos_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + interpolation_scale=interpolation_scale, + data_format=data_format, + ) + + # 3. 
Define transformers blocks + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + double_self_attention=double_self_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + attention_type=attention_type, + ) + for d in range(num_layers) + ] + ) + if self.inference_optimize: + self.simplified_facebookdit = SimplifiedFacebookDIT( + num_layers, inner_dim, num_attention_heads, attention_head_dim + ) + + # 4. Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + if self.is_input_continuous: + # TODO: should use out_channels for continuous projections + if use_linear_projection: + self.proj_out = linear_cls(inner_dim, in_channels) + else: + self.proj_out = conv_cls( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0, data_format=data_format + ) + elif self.is_input_vectorized: + self.norm_out = nn.LayerNorm(inner_dim) + self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) + elif self.is_input_patches and norm_type != "ada_norm_single": + norm_elementwise_affine_kwargs = dict(weight_attr=False, bias_attr=False) + self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_elementwise_affine_kwargs) + self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) + self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) + elif self.is_input_patches and norm_type == "ada_norm_single": + norm_elementwise_affine_kwargs = dict(weight_attr=False, bias_attr=False) + self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_elementwise_affine_kwargs) + self.scale_shift_table = nn.Parameter(paddle.randn([2, inner_dim]) / 
inner_dim**0.5) + self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) + + # 5. PixArt-Alpha blocks. + self.adaln_single = None + self.use_additional_conditions = False + if norm_type == "ada_norm_single": + self.use_additional_conditions = self.config.sample_size == 128 + # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use + # additional conditions until we find better name + self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions) + + self.caption_projection = None + if caption_channels is not None: + self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + # -------------------------- Quant Model Attributes ----------------------------------- + + def convert_quant(self, quant_config): + self.quant_config = quant_config + + self.quant_param_dict = {} + self.quant_layer_refactor() + + def quant_layer_refactor(self): + apply_func_to_submodules(self, + class_type=nn.Linear, + function=quant_layer_refactor_, + name=None, + parent_module=None, + quant_config=self.quant_config, + full_name=None, + remain_fp_regex=self.quant_config.remain_fp_regex, + ) + + def save_quant_param_dict(self): + apply_func_to_submodules(self, + class_type=BaseQuantizer, + function=save_quant_param_dict_, + full_name=None, + parent_module=None, + model=self + ) + + def load_quant_param_dict(self, quant_param_dict): + apply_func_to_submodules(self, + class_type=BaseQuantizer, + function=load_quant_param_dict_, + full_name=None, + parent_module=None, + quant_param_dict=quant_param_dict, + model=self, + ) + + def set_init_done(self): + apply_func_to_submodules(self, + class_type=BaseQuantizer, + function=set_init_done_,) + + def 
bitwidth_refactor(self): + apply_func_to_submodules(self, + class_type=QuantizedLinear, + function=bitwidth_refactor_, + name=None, + parent_module=None, + quant_config=self.quant_config, + full_name=None + ) + + def forward( + self, + hidden_states: paddle.Tensor, + encoder_hidden_states: Optional[paddle.Tensor] = None, + timestep: Optional[paddle.Tensor] = None, + added_cond_kwargs: Dict[str, paddle.Tensor] = None, + class_labels: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ): + """ + The [`Transformer2DModel`] forward method. + + Args: + hidden_states (`paddle.Tensor` of shape `(batch size, num latent pixels)` if discrete, `paddle.Tensor` of shape `(batch size, channel, height, width)` if continuous): + Input `hidden_states`. + encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `paddle.Tensor`, *optional*): + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. + class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*): + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. + cross_attention_kwargs ( `Dict[str, Any]`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + attention_mask ( `paddle.Tensor`, *optional*): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. 
If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + encoder_attention_mask ( `paddle.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. Two formats supported: + + * Mask `(batch, sequence_length)` True = keep, False = discard. + * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard. + + If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format + above. This bias will be added to the cross-attention scores. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, query_tokens, heads, key_tokens] (e.g. paddle sdp or ppxformers attn) + # [batch, heads, query_tokens, key_tokens] (e.g.
classic attn) + # pure fp16 + hidden_states = hidden_states.cast(self.dtype) + if attention_mask is not None and attention_mask.ndim == 2: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: + encoder_attention_mask = (1 - encoder_attention_mask.cast(hidden_states.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # Retrieve lora scale. + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + + # 1. Input + if self.is_input_continuous: + if self.data_format == "NCHW": + # (NOTE,zhoukangkang paddle inference ) make hit paddle inference elementwiseadd_transpose_pass. 
+ batch, _, height, width = hidden_states.shape + else: + batch, height, width, _ = hidden_states.shape + residual = hidden_states + shape = paddle.shape(hidden_states) + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = ( + self.proj_in(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_in(hidden_states) + ) + if self.data_format == "NCHW": + hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2) + else: + hidden_states = hidden_states.flatten(1, 2) + else: + if self.data_format == "NCHW": + hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2) + else: + hidden_states = hidden_states.flatten(1, 2) + hidden_states = ( + self.proj_in(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_in(hidden_states) + ) + + elif self.is_input_vectorized: + hidden_states = self.latent_image_embedding(hidden_states.cast("int64")) # NEW ADD + elif self.is_input_patches: + height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size + hidden_states = self.pos_embed(hidden_states) + + if self.adaln_single is not None: + if self.use_additional_conditions and added_cond_kwargs is None: + raise ValueError( + "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`." + ) + batch_size = hidden_states.shape[0] + timestep, embedded_timestep = self.adaln_single( + timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype + ) + + # 2. 
Blocks + if self.caption_projection is not None: + batch_size = hidden_states.shape[0] + encoder_hidden_states = self.caption_projection(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.reshape([batch_size, -1, hidden_states.shape[-1]]) + + if self.inference_optimize: + hidden_states = self.simplified_facebookdit(hidden_states, timestep, class_labels) + else: + for block in self.transformer_blocks: + if self.gradient_checkpointing and not hidden_states.stop_gradient and not use_old_recompute(): + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs = {} if recompute_use_reentrant() else {"use_reentrant": False} + hidden_states = recompute( + create_custom_forward(block), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + timestep, + cross_attention_kwargs, + class_labels, + **ckpt_kwargs, + ) + else: + hidden_states = block( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. 
Output + if self.is_input_continuous: + if not self.use_linear_projection: + if self.data_format == "NCHW": + hidden_states = hidden_states.reshape([shape[0], shape[2], shape[3], self.inner_dim]) + else: + hidden_states = hidden_states.reshape([shape[0], shape[1], shape[2], self.inner_dim]) + if self.data_format == "NCHW": + hidden_states = hidden_states.transpose([0, 3, 1, 2]) + hidden_states = ( + self.proj_out(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_out(hidden_states) + ) + else: + hidden_states = ( + self.proj_out(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_out(hidden_states) + ) + if self.data_format == "NCHW": + hidden_states = hidden_states.reshape([shape[0], shape[2], shape[3], self.inner_dim]) + else: + hidden_states = hidden_states.reshape([shape[0], shape[1], shape[2], self.inner_dim]) + if self.data_format == "NCHW": + hidden_states = hidden_states.transpose([0, 3, 1, 2]) + + output = hidden_states + residual + elif self.is_input_vectorized: + hidden_states = self.norm_out(hidden_states) + logits = self.out(hidden_states) + # (batch, self.num_vector_embeds - 1, self.num_latent_pixels) + logits = logits.transpose([0, 2, 1]) + + # log(p(x_0)) + output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") + + if self.is_input_patches: + if self.config.norm_type != "ada_norm_single": + conditioning = self.transformer_blocks[0].norm1.emb( + timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1) + hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] + hidden_states = self.proj_out_2(hidden_states) + elif self.config.norm_type == "ada_norm_single": + shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, axis=1) + hidden_states = self.norm_out(hidden_states) + # Modulation + hidden_states = hidden_states * (1 + scale) + shift + hidden_states = 
@classmethod
def custom_modify_weight(cls, state_dict):
    """Duplicate transformer-block weights under the ``simplified_facebookdit.`` prefix.

    The method is a no-op unless the ``INFERENCE_OPTIMIZE`` environment
    variable is exactly the string ``"True"``.

    When enabled, for every layer index in ``range(28)`` and every entry of
    the key table below, the value stored at
    ``"transformer_blocks." + <source key>`` is copied with ``paddle.assign``
    and stored at ``"simplified_facebookdit." + <destination key>``.

    Args:
        state_dict: Mapping of parameter names to values. It is modified in
            place; the source entries are left untouched.

    Returns:
        None.

    Source entries are read with a plain subscript, so an absent source key
    is not tolerated and the lookup error propagates to the caller.
    """
    if os.getenv("INFERENCE_OPTIMIZE") != "True":
        return

    source_prefix = "transformer_blocks."
    target_prefix = "simplified_facebookdit."

    # (destination template, source template); "{i}" stands for the layer index.
    # The table is loop-invariant, so it is built once rather than per layer.
    key_templates = (
        ("q.{i}.weight", "{i}.attn1.to_q.weight"),
        ("k.{i}.weight", "{i}.attn1.to_k.weight"),
        ("v.{i}.weight", "{i}.attn1.to_v.weight"),
        ("q.{i}.bias", "{i}.attn1.to_q.bias"),
        ("k.{i}.bias", "{i}.attn1.to_k.bias"),
        ("v.{i}.bias", "{i}.attn1.to_v.bias"),
        ("out_proj.{i}.weight", "{i}.attn1.to_out.0.weight"),
        ("out_proj.{i}.bias", "{i}.attn1.to_out.0.bias"),
        ("ffn1.{i}.weight", "{i}.ff.net.0.proj.weight"),
        ("ffn1.{i}.bias", "{i}.ff.net.0.proj.bias"),
        ("ffn2.{i}.weight", "{i}.ff.net.2.weight"),
        ("ffn2.{i}.bias", "{i}.ff.net.2.bias"),
        ("fcs0.{i}.weight", "{i}.norm1.emb.timestep_embedder.linear_1.weight"),
        ("fcs0.{i}.bias", "{i}.norm1.emb.timestep_embedder.linear_1.bias"),
        ("fcs1.{i}.weight", "{i}.norm1.emb.timestep_embedder.linear_2.weight"),
        ("fcs1.{i}.bias", "{i}.norm1.emb.timestep_embedder.linear_2.bias"),
        ("fcs2.{i}.weight", "{i}.norm1.linear.weight"),
        ("fcs2.{i}.bias", "{i}.norm1.linear.bias"),
        ("embs.{i}.weight", "{i}.norm1.emb.class_embedder.embedding_table.weight"),
    )

    # NOTE(review): the layer count is hard-coded to 28; presumably this is the
    # depth of the model the optimized path targets -- confirm before reusing
    # with a different configuration.
    for layer_idx in range(28):
        for target_tpl, source_tpl in key_templates:
            source_key = source_prefix + source_tpl.format(i=layer_idx)
            target_key = target_prefix + target_tpl.format(i=layer_idx)
            state_dict[target_key] = paddle.assign(state_dict[source_key])
+A man holding up a bunch of root vegetables for sale. +a man sits in a recliner with his laptop +A woman sits holding an umbrella near the group of women. +A brown horse standing next to a woman in front of a house. +A bathroom with a toilet, towel rack and a tub in it. +A couple of fishing boats docked next to pier on the ocean. +An adorable cat attempts to hide in a purse to steal the persons identity. +Two bulls who are walking on a street. +A classic car parked next to a large metal Canadian airplane. +A brown dog laying next to a bottle of wine +A group of three people sitting at a table with food. +A large doughnut sign above a shop for doughnuts. +Photos of two ferrets sleeping in a pet bed +A closeup of an apple in the foreground with three oranges in the background. +The basketball players are playing in a game +A man prepares food in a restaurant kitchen. +A colorful plate with a hotdog on it +A motorbike sitting in front of a wine display case +A person wearing a vest and tie in a yearbook photo +a small bathroom with a toilet in it +Two airplanes flying through a blue sky with smoke pouring out of their rear ends. +A street sign is filled with various stickers. +a man is standing behind a glass counter +A cook in a restaurant kitchen poses with food. +THERE IS AN ADULT CAT THAT IS LOOKING AT SOMETHING +A cat on land staring at a swan in the water. +A man is standing in a suit with his head cocked to the side. +An elephant walks down a dirt road with brush on either side. +A guitar on a bed in a room. +A bedroom with a bed next to a closet. +the bus is blue and is stopped. Some people are standing waiting for it +A red-headed man with glasses is looking at his laptop. +a man using a red phone receiver to talk on his cell phone +Small children with protective gear playing in a park. +A plate of food that includes lentils and leafy greens. +a large cow stairs across the snowy fields +A stop sign with a sign above it that reads, "David". 
+The van is driving down the street in traffic. +A man eating pizza without using his hands. +A table topped with bananas next to a coin. +A group of people at a party sitting next to a red umbrella. +A man holding up a mustard covered hot dog to his own face. +Two traffic lights showing red on a pole with a camera. +A commuter Amtrak train on the tracks +The man looks eagerly at a huge pizza in front of him. +A store with lots of unripe bananas and other products. +A street sign showing how to drive around a divide +A train traveling under a signal lights on top of tracks. +A baseball game being played before a crowd. +A selection of pottery vases in various colors +A ducks is flying over a body of water. +A young boy swings a baseball bat in the backyard. +A green bus is turning in front of a grey car. +A snowboarder standing in the snow next to a fence. +A black and white cat stares downward at a window. +Kneeling on the skateboard, the rider uses their arms to propel them. +a long line of red white and blue tourist buses +A bus and car wait at an intersection on a city street. +A cat laying on a wooden table under a umbrella. +there are many pieces of broccoli and vegetables here +A man sitting at a table with two laptops on it. +a living room with a couch and a coffee table +A big commercial plane flying in cloudy skies. +Multiple images overlaid of several women playing frisbee. +A couple of adult elephants with a baby one following along. +A living area with a red couch, mirror, window and painting. +an elephant walking in a field near many trees +A man and a young boy pull luggage next to each other. +A very tall brick clock tower sitting under a blue sky. +A traffic light with a sad face on the red light. +Red hot water cooker, bowl of fruit, bananas, cereal and a sliced orange sit on a white tile kitchen counter tip. +A computer keyboard sitting next to a computer mouse. +A tall clock tower made out of red bricks. 
+a close up of a motorcycle parked near a building +An airplane with wheels wheels barely off ground tilted slightly upward from the pavement to the blue sky. +A view of a snowboarder's own legs and feet with a snowy hillside behind. +A rusty old fire hydrant on an inner city street +The man in a costume suit is talking on a cell phone. +Toothbrushes and toothpaste lay on the counter by the sink. +two cups filled with veggies and nuts +A small propeller plane sitting on top of a field. +A man and woman sitting on a red bench +A man is skateboarding near the parked cars, +a basket of food on a table with fries +A baseball player taking a swing at a ball +A man about to stand up on a surfboard in the ocean +A bottle sitting on top of a white shelf under a metal shelf. +An airplane flying over a man standing in the ocean. +A bunch of sheep in the snow behind a barbed wire fence. +A squash sits on top of three bananas to form a claw. +A zookeeper brushes an elephant within its enclosure. +a girl with glasses standing in a kitchen +A man riding a snowboard over a snow covered slope. +Some African Americans are dressed in ties and working. +a bathroom with a toilet and a sink +a large silver plane is put on display at a museum +A skier dressed in red on the slopes during a snow storm. +a women combing her young daughters hair +A blue toilet bowl in a bathroom that says rabies +A woman sitting at a table with a plate of food in front of her. +A smart phone sitting next to a canned soft drink and a pen on a wooden table. +A small grey refrigerator on wheels and a sand fan +This a view from an airplane of the landscape below. +A person riding a beautiful wave very smooth +The tablet is next to a pair of scissors. +Two construction workers are walking in front of a cone and parked cars. +A view of a bathroom with an old tub and a hanging shower curtain. +A man walking on the beach flying a kite. +Young man up to bat during a ball game. 
+A plate of food placed next to a computer. +A woman in a short skirt holding a tennis racquet. +A young woman wearing mime makeup holding a racquet. +A chair and table with two monitor screens +An intersection with cars is pictured in this image. +Some people standing in front of a large building. +Two people walking towards a beach holding surfboards. +A plate of food with shrimp, pasta, and salad. +A shirtless man in a hat making luch +A surfer is at the peak of a wave and circling around. +Dog standing on top of cow inside the fenced area +an elephant standing on a dirt ground with trees in the background +NASA men making first, ceremonial pitch at a baseball game. +A bedroom with a bed under a window next to a green rug. +A woman sitting on a couch in front of an open laptop computer. +A young man is doing a trick over some stairs. +A person is doing a high jump on some skis. +a green truck parked in a dirt field with green shrubbery +The train moves thru this part of the city. +A carriage with people in it being pulled by two horses in the street. +A group of bikers riding down a street in traffic. +A red and white train sitting still on a train track +A young man riding a skateboard down the side of a ramp. +A small white bard with a long beak on a branch. +Group of people outside and one pointing up to the sky. +A couple of zebra standing next to eachother +A large jetliner with a man sitting inside of a compartment. +a cow walking on a city street near people +This is a view inside the cabin of the high speed train, showing relection of the light. +A white bus parked in a parking lot next to a car. +Hot dogs with ketchup and plain chips +Two men on blue tennis court playing game. +A man riding on the back of a green motorcycle. +A very tall clock tower with clocks on each of it's sides. +A couple of semi trucks parked next to each other. +A sidewalk sitting along side of a building at night. 
+A man standing in front of a picture of a building in plaid shorts. +A very tasty looking fruit salad on a piece of pita bread. +A young man jumping off a staircase while riding a skate board. +A lamp in the window of a house. +A city street is crowded with people and no cars. +A laptop computer sitting on top of a desk near a monitor. +A blue and white plate holds cut pieces of pizza on it. +Man and woman on an ice covered bridge. +A cozy bed stands between two windows within an upstairs bedroom. +A small bathroom with marble tiles and counter. +The table is holding, pizza, eating utensils and glasses of orange juice. +A young man wearing a tie and a blue shirt. +A string of Christmas garland hangs below a display of stuffed animals. +a man sitting on a bench looking a bit like forrest gump +A young man is sitting on his blue motorcycle. +A group of horses are grazing in the field. +The pizza on the table is half eaten. +a young girl is holding a teddy bear +A woman in her underwear riding on top of a paddle boat. +A table that has a cup of coffee and a book set on it. +A man with a saw standing next to a man with his brain exposed. +A dog herding a group of sheep in a meadow +a young child on a skate board indoors +A street full of confetti and streamers with a building in the background. +A european city in nice a sunny bright day +A woman sitting at a table while eating food. +A man looking in the refrigerator with a cat also looking inside +A road filled with snow and traffic lights. +Large clock tower with ornate brick work and windows. +Small black bird standing on a wooden table or deck. +A large white airplane with blue tail parked. +Closeup of a hand holding a black cellphone. +A lady eating a slice of deep dish pizza. +A big commercial plane flying high in the sky. +Motorcycle parked on the curb of an empty road. +a bicycle in a living room with a tv +there is a sheep behind a fence standing in the snow +A man standing near home plate swinging a bat. 
+A beautiful girl sitting at a table with an orange. +A living area with a fan and furniture covered in white blankets. +A giraffe looking over the corral fence in his zoo habitat. +A person riding on top of a surfboard on water. +A white toilet tin a bathroom sitting next to a sink. +a steam locomotive on a narrow gauge track in the snow +The room is filled with remarkable items well suited. +A man riding skis down a snow covered slope. +a close up of a giraffe and a zebra in a field near trees +A plush alligator on a bed covered with a bedspread and pillows with a lamp on above the bed. +A sandwich and some sides sit on a tray on a table. +A wine bottle and glass sit on a table in front of a couch. +There are a lot of sailboats anchored in the water today. +A kitchen with a metallic refrigerator and stove top oven. +An old fire hydrant in the middle of the woods. +A bear in the woods standing on a log. +A man sitting on a chair on a boat dock. +A very cute cat laying by a big bike. +A photo of a man swinging a tennis racket. +You can mash food together so that no one will know what it is. +A living room consisting of windows, rugs, chairs, and a coffee table. +a woman is throwing a tennis ball to serve +a woman laying in bed with just a blanket covering her +A large elephant standing on top of a grass covered field. +A decorative vase with some yellow flowers in it. +A small computer screen opened in the room. +A baby elephant in dirt area next to a fence. +Blurry color photo of people enjoying a ski resort area +A girl in white shirt playing with a yellow frisbee. +A man holding a phone up to his ear. +A woman scanning a tour map with her smartphone. +A group of people, sitting in bleachers, watch a boy bat at a little league baseball game. +A person cross country skiing on a road. +An elephant walking down a street next to a crowd of people. +An orange lying next to a green utensil. +Bathroom with a white tiled floor and sink. 
+a person that is jumping his skateboard doing a trick +A man swimming in a pool of blue water. +A parking meter next to a handicap parking space. +A white and blue city bus traveling down a city street. +Three giraffes standing amongst a copse of trees. +A white bed sitting between two lamp under a picture. +Science project to operate a red, blue and green light +A pizza topped with sliced up bananas on a pan. +A man holding a plate standing inside of a kitchen. +A cat on a pet bed looking at a laptop. +Three people are at a table, one with a banana and one on a cell phone. +Two lap tops sitting next to each other on a computer desk +A person and a large elephant standing together. +A woman talking on a cell phone while holding a cup of coffee. +Two cakes sitting on a class table near a candle. +A couple of cows on a dirt road. +People in navy uniforms and one person talking on a walkie- talkie. +A white bowl filled with vegetables sitting on top of a table.. +A group of men standing on a street corner next to a stop sign. +Several fruits with Chinese characters written on them. +A plate of food sitting on a black tray +a blonde cat is standing by a laptop +a close up of a person with a tie in his mouth +A lunch box with a slice of pizza, and pretzels. +A small herd of giraffe standing next to each other. +Two people standing next to each other on top of the snow. +there is a man walking threw a hallway with his luggage +A man on a motorcycle on a racetrack. +a rusty flatbed truck sitting by a building +Three people standing around a brown horse with a white stripe on his face. +A group of people stand in a circle and eat a food. +A red passenger train at an underground platform. +A man wearing a white shirt and a flowery tie. +A building in the sky that has the lights on. +Fish eye angle view of small kitchen with fire extinguisher at far end. +An orange truck with some people in back +A train traveling down train tracks next to a building. 
+A group of people are taking surfing lessons. +A bedroom has many posters on the wall. +A group of children playing a game of soccer. +A yellow fire hydrant sitting on the side of a road. +That is something being here taken in the picture. +A red fire hydrant on the side of a street. +A closeup of a cream and orange colored cockatiel. +A laptop computer and desktop computer monitor on top of a desk. +A pile of assorted fruits and vegetables on a counter. +A calico cat standing on top of an upholstered chair. +Mr. and Mrs. Santa Clause standing on the side of a road as Santa points to the sky. +A man and a woman standing in a field flying kites. +small boats parked in the ocean at sunset +A thin crusted pizza dish cut into four slices +A road sign next to a parking lot that reads "FLAMING LIPS ALLEY". +Two women looking at three giraffes behind a fence. +The Giraffes are almost camouflaged in the terrain behind them. +Two bikers, one in front of a building, the other in the city. +A plate of meat, vegetables, and two large slices of bread with a red wine glass next to it. +A clock tower surrounded by blowing snow in a city. +A herd of sheep grazing on a lush green hillside. +A neat living room in a wood cabin. +The orange fire hydrant is on a sidewalk in front of a brick building. +A black cat with crazy eyes wearing a bib. +A silver miniature train with trees in background. +a man setting up his kite to fly +A white toilet sitting next to a white sink. +A girl lies on a couch with food in her hand. +A woman hitting a tennis ball on a tennis court. +A toilet, sink, mirror, and tub in a bathroom. +A beautiful woman in short shorts standing next to a river filled with elephants. +a man in a kitchen cutting lettuce with a knife +A dish of broccoli is sitting on a plate. +A black and white dog is catching a red frisbee. +A security officer using a segway as a footrest +A slice of cheese cake sitting on a plate next to a fork. 
+A man is surfing a large wave towards the shore. +A building wall and pair of doors that are open, along with vases of flowers on the outside of the building. +A purple vase filled with multi colored flowers on top of a table. +A vintage truck with a surfboard on top is painted matted black. +A large truck stops on an infrequently traveled wooded road. +A person walking down a street while holding an umbrella. +Kids enjoying the skateboard park on a sunny day +A living room with boxes packed and a wine glass on a table. +A large brown dog laying on top of a couch. +An SBS Transit double deck bus on a city street. +A giant Amoco sign sitting above a gas station. +A man that is sitting down on a bench in front of a table. +A young boy wearing a catchers mitt while standing on top of a field. +a bowl full of mixed vegetables and pieces +A dinner table shot of a person holding a sandwhich accompanied by fries and beverages +Dog lying on bedding material in suitcase in large room. +A pile of broccoli is being displayed on a plate. +A skier that just made a jump several feet up in the air. +A long yellow and green train traveling past a lot of trees. +Double decker bus with dancers advertised in the side +a close up of many different vegetables on a table +A group picture of young men and women at an event at night. +there is a red stop sign on this street +The woman is talking on a cell phone +A plate full of broccoli and asian noodles. +a person holding a pillow above a ripe banana +A woman in a bikini top carries a surf board near the water. +a yellow and green train, at stop lights. +The man is riding his motorcycle while smoking a cigarette. +A group of people sitting on benches having lunch together at a curbside garden area. +An Ox standing in a street while a person rides by on a bike. +A stop light that is green that also has various other street markers on it. +A very pretty girl working with some food in a kitchen. 
+Six uncooked doughnuts sit on a baking tray. +two kayakers enjoy the clear open water +Two ultimate Frisbee players jumping to contest a Frisbee. +Several views of mean playing with a white disc on grass. +White bowl full of chopped carrots and broccoli. +a public transit bus parked with its doors open +A man riding a snow board on top of a snow covered slope. +An advanced home office with three different computers +A female tennis player stands close to the net holding a racket. +A building with a large circular window that has a designed iron bar cover on it and a pigeon flying from it. +a child on a raft with a paddle +this man is riding a board near a field +A woman standing next to a podium holding a tennis racquet. +a couple of people on skis ride through the snow +A couple of sheep sitting on top of a lush green grass covered hill. +a clock hanging from rafters of a building +THERE IS A WALL WITH FLOWERS ON IT AND A BIRD +A couple of suit cases sitting on top of a wooden block. +A "Virgin Records" train next to a blue, yellow and red train at a subway station. +A group of young children playing with a soccer ball. +a baseball player swinging a bat on a field +A bike parked in front of a parking meter. +A tennis player in action on the court. +Four men hold a kite shaped like a clock. +That doll has really dark and black eyes. +A man taking a selfie in a bathroom mirror. +A group of people riding skis down the side of a snow covered mountain. +Many boats tied to the edge of the bank. +A tall glass vase on a balcony. +A group of people holding umbrellas in the middle of a graduation. +Line of people skiing down a snowy slope. +A close-up of a plate of pasta and meat with a piece of broccoli. +A white dog is on a sandy beach while the sea foam washes ashore behind it. +A picture of a silver colored blackberry phone with a graph on the screen. +A man is walking down the street next to a pole with a clock. 
+A wooden table topped with a computer monitor and two keyboards. +A man holding a Nintendo Wii controller in front of a TV. +Wooden table with two plates full of cheese pizza. +Two children enjoy a meal at a restaurant. +a large clock surrounded by statues and emblems +An adorable baby laying in bed with a stuffed brown teddy bear. +A group of people riding on the back of an elephant. +a small girl and two giraffes and some trees +Mirror view of a bathroom with a sink and tub. +A speedboat pulls a water skier who is flying in midair. +The two zebra are walking in a single file line. +The furniture in the living room is decorated with flowers. +The young girl is posing with a softball bat. +Rows of motor bikes and helmets in a city +A couple of people riding a pair of skis down a snow covered slope. +Road is littered with old cars and some wrecked trucks with a lone red and white stop sign at a grassy lot +A toilet in front of a window, and next to the shower are shown +A group of elephants are walking on a dirt road. +Two skiiers kneeling in the snow by an orange flag. +A traffic sign with writing on it at an intersection. +Bunches of garlic and bananas hanging from a stall. +A concrete building with towers, a steep in the middle and a clock underneath. +A family of five takes a ride on an elephant. +A young lady kicking a soccer ball on a field. +A group of boats sitting on top of a beach. +Three bananas that are sitting next to a laptop and cellphones. +An advertisement on a trailer of baseball player +a group of little elephants standing close to each other +a sausage pizza a butter knife and a black platter and plate +A person laying in bed in front of a TV. +A laptop computer sitting on a desk next to a desktop computer. +a small child holding a tennis racket with two hands +A group of three dogs standing on the lawn +Baseballs players sliding to base and jumping during the game. +A person with a hat on flying a kite by a plane flying in the sky. 
+A wet road with lots of cars driving over it. +A couple of plants sitting inside of a pot near a window. +A car driving through a tunnel under buildings +The halved melon is on the counter next to the remote. +A couple of zebra standing next to a rhino. +A see through container with a lemon and apple dropped in it. +A group of teenagers are playing tennis outdoors. +A red and yellow bus sits on the back of a flatbed truck, driving down the highway. +Several street signs are mounted in an urban neighborhood. +Several motorcycles that are parked on the side of the street. +A yellow and white train traveling down train tracks. +Sheep with numbers painted on them in a green grass field together. +A person on a skateboard up in the air. +A large white stove top oven in a kitchen. +An albino elephant with its baby standing in a marsh area with others. +People are in a large building with luggage. +a desk with a banana a keyboard and a mouse +Two very large birds pose outside on white chairs, side by side +A microwave oven sitting on top of a counter top. +A woman wearing a net on her head cutting a cake. +a public transit bus on a city street +a kitchen with a refrigerator a sink and a stove +A cat curled up on a bed sleeping with a man sitting in the back ground holding a laptop and watching the cat. +A woman standing on a beach is surrounded by birds. +An upright clock near wooden tables and chairs. +a guitar an amp a desk a keyboard and a monitor +A teddy bear can be seen from outside the window. +A man about to hit a tennis ball on a concrete court. +Man performing skateboarding trick on cement in daylight +A red tray of food on a table. +A white semi truck parked on a large muddy puddle. +A copse of men sitting next to each other at a table. +A group of young people sitting on a couch next to a guy playing a Nintendo Wii. +A woman on a bench plays an accordion as a man looks on. +A couple of men playing a game of frisbee. 
+A couple of trains traveling down train tracks. +A train with several cars is going around a corner. +A wooden table with teddy bears sitting around it. +A woman standing in front of an oven near another woman. +A young boy cuts into a cake shaped like a skateboard. +A bear walks alongside a road near a tree. +A person is taking a picture of a hotel bathroom. +Group of folks playing bowling on Wii sports +A close up of a woven basket on a bicycle. +A computer mouse is sitting on top of a keyboard. +this is a street with a brick building +Several people in a living room sitting on different sides of the room +The two filaments of matter have been separated from each other. +An adult black bear let's his long tongue protrude. +A group of people sitting around a circular table with food. +the table has different foods and drinks on it +A group of people walking down a busy city street. +A baby elephant panting on a white canvas. +Artwork of a ship with three masts and one sail open with a scull and crossbones on it, in bluish gray water with gray cement wall in background. +A black cat sitting on top of a wooden fence. +Two surfboards on a beach in an urban setting. +a street pole with some signs sitting on them +A large passenger jet sitting on top of a runway. +A zebra in the grass who is cleaning himself. +three pieces of art made in white with gold details. +An airport with a woman carrying a piece of luggage. +Bright orange flowers in a jar partially filled with water. +A surfing simulator with actual water in the city. +A man in a blue outfit holding a white frisbee. +Young man flying a kite over a grassy area by water. +A baseball player holding a bat over home plate. +An adult and two baby elephants standing near a fence. +An adult and baby giraffe walking through a field. +A surfer is squatting down to prepare his board. +A long, narrow yellow kitchen with black and white floor tiles. 
+a living room with a chair near a tv +A horse and rider jumping over a bar on a track. +A silver train traveling past a train stations. +A living room display at an Ikea store +City buses prepare to leave the bus station +group of giraffes mingling in a stand of trees +A clock sitting in the middle of a walkway. +A group of people standing near a clock tower +an elephant standing by some trees with it's trunk in the air +A man holding a red and white surfboard in front of an elevator door. +A giraffe with its teeth on a fence. +a young girl is on her phone outside +Two small clocks sit behind a glass window. +The dog is laying down in the suitcase. +People watching elephants enter the water by a river. +A black and white picture of a building with a clock outside. +A fenced in pasture with four horses standing around eating grass. +A large black bear standing by a tree. +A red stop sign sitting under a green street sign. +there is a man and a woman playing a video game together +A man and a young boy dressed in matching white dress shirts and ties. +Skiers doing stunts over a hill of snow. +A big rock walkway is on a hill beside the ocean. +A brown cow laying on top of a lush green field. +A bed covered in creepy black blankets and pillow cases +The little girl is dressed in pink and holding a umbrella. +An empty side walk with in a city +Two peanut butter and jelly sandwiches sliced in half +A group of horse mounted police standing in front of a crowd. +many horses st a horse stable with people walking by +The bananas on the tree are not ready to be picked. +A man holding a bunch of ripe bananas. +A white toilet bowl with a cleaner thing in it +A man pulls a rolling suitcase and is wearing a suit. +A blue boat docked on a green lush shore. +A young man swinging a baseball bat on a baseball field. +A stop sign with green ivy growing on it at a corner. +A woman on the beach playing with a frisbee. 
+A downtown urban street scene with skyscrapers and historic buildings. +A black cat staring into the distance in a room +A big pizza and some other assorted food items. +Group of people standing at a dinner table with different color plates. +a man standing on the side walk while holding a sign +A large metal clock tower below very tall building. +A family on the back porch sitting around a table. +The little boy is practicing riding a surfboard at the carnival. +A brown dog standing next to a white rabbit. +A woman holding a birthday cake with one candle near a man with a baby in his lap. +A herd of cows walking down a small country road. +A cat on a city street with people. +A white car is parked in the street at night time. +A white plate with a cut in half sandwich and lettuce. +a hotel rooom with two beds and a food tray on one bed +A dog sitting in the passenger seat of a car. +A person pretending to light a man on a laptop screen's cigarette. +A group of ducks floating on top of a lake. +a batter that has just hit the ball +A plate with grated coconut and decorations on the table +A blending mixer sitting on a kitchen counter. +A small bedroom has striped wallpaper going floor to ceiling +A bowl with rice, broccoli and a purple relish. +A green motel sign hanging from a fake cactus. +A woman wearing a military uniform injecting a cow. +A large free standing street clock with a cloudy sky. +A man riding a skateboard off the side of a ramp. +A display case filled with assorted flavored donuts. +Many kites in different shapes and sizes fill the sky. +A man examines an electronic device standing by a table. +A view of a baseball game during the day. +A brown dog standing next to a black dog as they fight over a frisbee. +A group of people riding on the backs of motorcycles. +A bunch of bananas hanging above somewhere on a ceiling. +A group of motorcyclists stay in a group in traffic. +A bicycle store shows two males leaning toward a bike. 
+A bare bathroom has a window over the toilet. +A couple of elephants standing in shallow water. +A man standing on a tennis court holding a racquet. +A young man hitting a tennis ball with a racquet. +A black and white shot of people standing in the rain in front of a castle structure. +A bunch of green bananas hanging from a banana tree. +A person on a skateboard does a trick on stairs. +A group of kids play on a grassy area and lift their arms up as a Frisbee is just above their heads. +A few Zebras are standing in the wild together. +A herd of sheep are grazing in a green field. +A unique restroom is seen in this image. +A woman sitting on top of a red bench next to a man on a bike. +a dog laying on a bed with a tv in the background +a close up of a plate of pizza near a glas of beer +an umbrella set up on a beach in the sun +A cat sleeping in a large piece of luggage. +The book was laying on the unmade bed. +A small bathroom with a toilet, sink and window. +there are many skiers standing in the snow +A person squatted on skis in the snow. +A pretty lady laying in bed with a large black dog. +Several beautiful vintage racing motorcycles on public display. +A street scene with a double decker bus. +A man with a tennis racquet is dressed in white. +A giraffe standing in the middle of a field. +Some very cute giraffes in a big grass field. +A living room with a sofa chair and television. +A zebra laying next to another zebra on a dirt covered floor. +A pitcher throwing a ball during a baseball game. +A toilet and sink side by side in a bathroom and a mirror. +A black cat is sitting on a suitcase looking up. +A red table holding a black laptop with red rope on it. +The airplane in the sky is doing tricks while spitting out smoke. +A large sandwich filled with meat on a plate. +A motorcycle parked on the pavement near a building. +A person kiteboarding over the ocean on top of waves. +An open laptop computer sitting next to a phone. 
+A lush green field topped with lots of vases. +person in shadow watersking in large body of water +A bear pokes about in the water while seagulls look on. +A calico cat is curled up on a mat taking a nap. +A bird is perched on a boat which is anchored in the water. +A zebra looking alertly at the camera while in the field. +Several people are seen sitting around and smoking. +A harbor filled with boats surrounded by buildings. +A bird is sitting on a branch among unfocused trees +Man laying on bed with shirt open looking into device for picture. +A room full of guys and girls sitting a various tables, some have laptops and food. +A modern kitchen with stainless steel fridge, wooden cabinets, and overhead lighting is separated from the dining room by a bar. +a man leans against a brick wall while he talks on his cell phone. +A person on a skateboard does an air trick. +A wooden table topped with vases full of flowers next to a brick wall. +A pair of shoes sits on a bench next to a door. +A red fire hydrant on a concrete block. +A sailboat with a dragon on it and the word Tolkien +A city street filled with traffic and buses. +Raw cookies in a pan on the counter and baked cookies in a pan on the stove. +A black and white dog sitting on a park bench +A toddler boy is on a white pad playing a video game. +An orange and black fire hydrant at the edge of a street. +A man does a grind on the curb with his rollerblades +A person riding down a trail in front of a person on skis. +A man peers over a small plate behind a napkin lined with pieces of fruit. +A white cat standing next to a cluster of palm trees. +a white plate with some food and two trays of sauce +A dog standing on a bed in a room. +A plate of food with a sandwich with a runny egg. +A motorcycle parked next to a black truck. +A man with a orange headband is playing tennis. +A refrigerator door is open and it is full of foodstuffs. +A man riding a skateboard on top of a skate park. 
+A plate of broccoli and meat on a table +A large white polar bear walking across a dirt and gravel ground. +A man standing behind a bench with something in his hands. +A group of people standing outside of a food truck +A giraffe standing in a lot filled with cars. +A person that is on his cell phone in a car. +A plate of food sits on a white table. +A person standing in a kitchen holding a Nintendo Wii controller. +A plate filled with eggs and sausage next to a cappuccino. +A black/white photo of a poolside dining area with the umbrellas colored. +People crossing the street at a busy intersection +A mirror sitting on top of a wooden dresser. +An assortment of items that were in a purse. +Blue scissors cutting up a plastic credit card +A man standing next to a motorcycle on the side of a hill. +A street sign next to a public road in a city. +A picture of a street named Bill Robertson +there is a zebra that is seen standing next to a tree +A line of wagons with vendors selling fruit. +A group of teddy bears sitting on top of a bunk bed. +A woman that is standing holding a remote. +A man riding a snowboard down a slope with many hills. +A hotel room with two beds and a painting on the wall. +A woman with a umbrella on a city street. +Two zebra feasting on a dead animal in a dry grass field.. +a family that is eating at a table +A red stop sign with a picture of a Time Magazine cover underneath stop. +A white bed with a turquoise blanket, with a two piece painting hanging above it. +A close-up of an orange on the side of the road. +A man holding a tennis racquet in front of a crowd. +broccoli being sauteed in a pan with a wooden spoon +A cat sleeping on a pillow next to a book. +A transit bus stopped at a street side that's filled with snow. +A poodle is laying on the back of a green couch. +A three wheeled motorcycle parked with some other old vehicles. +A giraffe standing next to a pile of stones. +A woman that is standing up with skis. 
+A white and blue bus driving past houses on a city street. +A brown teddy bear sitting in a red chair. +A plate on a wooden table full of bread. +An antique living room with elegant looking furniture. +A large truck on the side of a street. +A lion head mask is hanging in an outdoor shop. +A horse eating hay standing in a mowed field. +The snow is very crowded with snow skiers. +A man trying to snowboard in the dark. +A man and two sheep stand on green grass with a castle in the background. +An elephant and a bunch of cattle at a watering hole. +A couple of beds sitting in a bedroom under paintings. +A small black sign on the refrigerator door +A shower equipped for a physically challenged person with a shower chair +A girl jumping in the air under a kite. +A woman sitting under a hair dryer in a salon. +A kitchen with lots of cabinets with an oven underneath a microwave oven. +A group of people on motorcycles waiting at a traffic light +Snowboard stuck in a thick collection of trees. +a baseball player hitting a baseball with a wooden bat +the door to the room is open to the outside +A cake shaped like a horse with white frosting and decorative candies in different colors. +A sandwich sitting on top of a large wooden block. +an umbrella in a field of flowers +a man with a racket prepares to hit a tennis ball +Luxury double decker bus ready to depart station. +A large black bear walking next to a huge stone wall. +Two messy toilet stalls with toilets where one lid is raised. +Woman adjusting man's tie in occupied room with others. +A succulent flower has a lime green center. +A man standing in front of a bathroom mirror. +A group of people flying a kite in a field. +A person on a court with a tennis racket. +A man with an umbrella sits on the hood of a car. +A group of people riding in the back of a truck. +A collage of boys dressed as baseball fans. +A kid standing with a glove by a fence. 
+various food items in bowls on white counter +A bear is dressed dup to coordinate with the Christmas display behind him. +A pizza sitting inside of a small cardboard box. +Large white flowers are in a glass with paper tied around it. +An arrangement of a fruit skin and flowers is displayed. +A young lady riding skis down a snow covered slope. +Two brown teddy bears sitting side by side. +A toddler laying in a bed with their head on the pillow. +Guy poses for picture by kitchen sink with two dogs +an elephant playing with a piece of wood and a fence +A green stop light that is hanging above the street. +Several street signs posted together on a poll. +A man and woman standing in front of a cake. +A group of girls sitting on a towel under an umbrella. +A bathroom sink with lots of lights above it. +A kitchen has a stove with cupcakes on top of it. +A man is playing baseball before a crowd. +A woman riding skis down a ski slope holding ski poles. +Several horses hold their heads down toward water in a pond. +A motorcycle has three people riding on it. +Cross-country skiers are getting their workout for the day. +A professional tennis player exerts himself during a game +a living room with hardwood foors , a tv and table +A pizza covered in lots of cheese and toppings. +Three computer screens showing a panoramic beach scene. +A row of buses sitting next to each other in front of a tall building. +A person surfing in the middle of a wave. +There is an elephant standing near a tree. +A man riding a skateboard on a street. +A man taking a picture of himself in front of three huge beer bottles +A toilet setting in a bathroom with tile flooring. +A woman lays on her bed with her dog while using the mirror to take a picture. +Many motorcycles are parked around a crowd of people. +A basket filled with lots of fresh produce. +A banana and some fruit on a table. 
+this is a child spinning around a bat outside +large dog retrieving the frisbee for his owner +A group of men riding on the back of a boat. +A bathtub sitting under a chandelier and next to a pair of windows and a mirror in a bathroom. +A black-and-white photo of a girl with a teddy bear on her shoulder. +Cafe tables with table cloths and orange umbrellas over them. +A man that is standing in the grass near animals. +A restaurant table with a plate of vegetable pizza and garnishment +A group of people standing next to each other on snow. +People are hiding under colorful umbrellas on a rainy day. +A girl is holding her hand out with a kite in it as storm clouds hover over her. +A person that is swinging a baseball bat. +a bathroom that has a sink and a toilet in it +A water jug used as a vase to hold flowers +A white toilet seat in some lavatory somewhere. +Two giraffes eat from a pot attached to a fence. +A long green and yellow train traveling down tracks. +Oddly shaped homemade pizza about to be cut with pizza cutter +A dog chasing a white frisbee on a dirty and grass park. +A train pulling past a church with a large clock tower. +A black bear walking across a lush green field. +A group of people walking down a wet sidewalk. +A man riding a surfboard on top of a wave in the ocean. +TWO PIECES OF PIZZA BOTH DIFFERENT IN A BOX +A refrigerator that has many pictures and magnets on it. +An open refrigerator containing various fruits, vegetables and jars. +a big bus is parked as a some people ride by in a cart +Three surfers standing on their boards in a wave. +Large barrels of vegetables are in an open market. +This is a large fish that is in an aquarium. +Some type of colorful vegetable is wet and sitting on the counter +A jockey riding a horse as it leaps into the air. +Red and yellow fire hydrant with the lid off. +A woman on a surfboard riding a wave. 
+Close up of a traffic light with three lights, the top illuminated red with a person image, the second down not illuminated, and the bottom on hanging down. +A group of girls play frisbee outside together +A group of people riding swings in the snow. +A man holding a tennis racquet on top of a tennis court. +A woman in a wet suit surfs a wave. +Horse in fenced pasture with others grazing on grasses. +A painted vase is holding several colorful tulips. +Small home made pizza with olives and pepperoni. +A rusted out ship sitting on top of a body of water. +a trio of men playing with a frisbee on an open field +a man reaching into the refrigerator in a kitchen +A group of people standing in a room. +a man on skis stands on a snowy hill side +A little girl in a public bathroom for kids. +A clock that is perched on clothing to make it look like a head. +A yellow train parked at a train station next to a platform. +fifteen different varieties of doughnuts in a display case +Fans observing a baseball game in process. +A group of people on a field playing baseball. +A long boat filled with people sitting on top of a body of water. +A wooden cutting board topped with a pizza on a table. +A airplane that is sitting on a tarmac. +A laptop and some computers on a desk. +A cloth bag is on the keyboard of a laptop. +A house filled with windows and a platform. +A man water skiing behind a boat on the water +a chicken meal with carrots broccoli and rice +A woman sitting at a wooden table in front of an open laptop. +A man holding an apple between his fingers. +A photo of beer is cropped next to a photo of food. +A woman sits on a bed and holds a book. +A man is in a kayak in a pool with a ball. +A group of palm trees with lots of bananas hanging from them. +A plate of food with a sandwich and fries. +A game of Tennis being played on TV +a truck with a small cabin built on the back of it +A parked truck with an artistic design on its trailer. 
+A man carrying a blue piece of luggage near an escalator. +A large white clock tower sitting in the middle of a city. +A yellow food truck parked close to a car +A toilet and urinal side by side in a bathroom. +An adorable cat rolling around on it's back while holding a toothbrush in it's mouth. +People on a motorcycle holding a wrapped surfboard. +People preparing to ski down a snow covered slope. +Batter, catcher, and umpire are ready for the pitcher to throw the ball. +There is a horse drawn carriage on the side of the road. +A refrigerator filled with lots of soft drinks. +a large plane is docked at the airport and connected +A horse grazes on grass in the shadow of a mountain. +A woman tossing a frisbee while wearing a bikini top. +A small child touches the trunk of an elephant standing behind an enclosure. +A group of people holding cell phones and posing. +A clock displays the time outside of a bank. +A group of people standing on top of a beach near the ocean. +Very cute cat laying on couch holding the remote control +A large double decker bus driving under traffic lights. +Hauling items in a wagon is a lot easier than carrying them. +A person falling off of a surfboard while other surfers watch. +A black and white photo of a bicyclist. +A thing is in the outline and it shows up like something +A floor covered in food and office supplies. +A kitchen filled with a wooden cabinet and a large window. +A toilet and bidet sit in a bathroom that is under construction. +A cat sleeping on top of a stereo by a plastic clock +Many planes are parked on a plane run way +a big body of water with a freeze be next to it +A blue pot of tomato sauce with a wooden ladle. +A man is talking to a woman behind a podium. +A man and a dog on a boat in the water. +A zebra in captivity grazing in its exhibit. +Someone is on their motorcycle, in his gear. +A pizza that is sitting on a wooden board. +The cat is looking through the window at the animal eating the seed in the feeder. 
+People and their children are gathered outside meeting together. +Three demonic looking dummies standing next to each other on a snow covered slope. +Two pizzas covered in veggies sitting on a table. +A vase with flowers that is standing on a stand. +A man bending over in the woods with a frisbee in each hand. +Two stuffed animals sit at a table with honey. +A person on a street next to a motor bike. +A giraffe standing in dirt field next to trees. +Two buildings standing tall but the tallest one has a clock on it. +A fire hydrant stands alone in the middle of the concrete. +A young person in a hoodie with headphones in on a laptop. +A woman and a man walking down a street carrying luggage. +LOTS OF BACKPACKS AND HATS LINED UP ALONG A WALKWAY +A man serving a tennis ball on top of a tennis court. +An office desk holding multiple computers and various supplies. +A bathroom with a toilet and a couple towels. +A cat sitting on top of a blue piece of luggage. +A couple of people skateboarding in a graffiti filled area. +A wooden clock sitting up against a white wall. +A man riding a skateboard in a river bed. +A her of black and white cattle standing on a field. +The young girl runs toward the net to meet the tennis ball. +A woman in black shirt riding a skateboard. +A glass vase is holding a few colorful flowers. +A young boy standing in front of a parking meter. +A man on a tennis court at night time. +A black and white photo of people at a train station +A mirror reflecting a clock next to a shelf in a kitchen. +A man writing in a notebook with several partially filled wineglasses in front of him. +a small pepperoni pizza next to a fork +A modern jet airplane coming in to park at the gate of an airport +A zebra standing on top of a lush green field. +A young man swinging a baseball bat on a field. +A baseball player standing next to home plate. +a close up of many large kites near the ground +A stack of old trunks and luggage against a wall. 
+The bed covered in many sheets and blankets is the only furniture in the room. +A full view of a kitchen with many things to use. +A man flying through the air while riding a skateboard. +A young man standing next to a wall with a bird painted on it. +A person standing inside of a clock shop with lots of clocks. +An extravagant bedroom with focus on the chandelier. +A newly married couple standing in front of a cake. +The man is trying to surf the waves on the water. +A zebra that is bent over eating grass. +A walkway with groups of people flying kites beyond it. +a tennis player running to get to the ball +Two animals standing in the grass near trees. +A laptop computer sitting on top of a table. +A woman on a bike with a baby seat holding a dog leash. +A surfer riding their surfboard through waves in the ocean. +A group of parked bikes sitting on the side of a road. +a dining room table that is in a room +A blue motor scooter parked in front of a brick building. +The school bus is remodeled with bright paint. +A large propeller plane mounted to the ceiling of a building. +A man on a skateboard is holding on to string. +This couple is making a funny face while sitting on a couch +A woman and a man sitting on a couch next to each other. +A skateboarder does a trick in the middle of the street. +A man and a woman cutting up a small white cake. +Two guys are checking their messages on their phones. +The white polar bear is standing near a tree branch. +A woman exercising a brown horse in a riding ring. +The young man is eating a hamburger with a pizza folded into it. +A young man running towards a white Frisbee. +A couple of zebra standing next to each other. +some dessert is laying out on a yellow and white plate +A bathroom stall that has very dirty walls. +A zebra staring blankly near a watering hole. +A small boy holding a plate of tasty looking food. +A busy intersection in a city has many tall buildings and advertisements. 
+A boat is sitting in the water at a dock +A series of photographs depicting bathroom before and after minor changes. +A close up view of some very pretty pastries. +A bathroom has a shower and toilet in it. +A display case filled with lots of different kinds of donuts. +A boy is standing in an inflatable pool on a surfboard. +A tennis player is swinging to hit the ball. +A man standing next to a light and a sign. +A woman sitting on a bench holding a bag. +A bunch of white flowers is in a clear vase. +There is a tiny bird on the branch. +Small black and white pig on wheeled cart with protest sign. +A woman and two men are making pizzas around a table. +A woman cutting a sheet cake at a table. +Giraffe in its natural habitat snacking on a tree +A woman preparing to hit a tennis ball while a man watches. +Various street signs including one that reads "Newlon Hale Village." +A blonde lady holding a smart phone laughing. +A room with a refrigerator with a house plant sitting on top of it. +a red and white bus and some cars in the rain +Two goat kids and one young lamb are in a field. +A grey and white dog sitting in the passenger side of a car. +A bird is flying over some rocky water. +A woman stands on some grass, holding some frisbees. +A pretty young lady running towards a tennis ball while holding a tennis racquet. +Two individuals, each on a horse, leading four other horses. +A smaller car is stopped at the red light, as traffic drives on. +A man on skis skiing down a mountain slope. +Looking into the mirror of a residential bathroom +A silver train passing under high beams and trees. +A model of a red cow wearing a pink top hat outside of a building. +A herd of sheep walking across a snow covered field. +This lady is ready for her slice of cake. +A white kitchen with marble countertops and stone floors. +Two zebras, one walking toward the camera, one walking away. 
+A green and yellow railroad train pulling into the station +A watch set to 12:25 sits on the table. +A group of people is holding their cell phones up to the sky. +A close view of the round part of a tablespoon shows a wilted piece of lettuce resting on the spoon. +A blurry photo of a train riding along +No image is displaying to describe in this box. +A large white bed with a wooden headboard. +A herd of animals grazing on top of a grass field. +A slicing up a cake on top of a table with a knife. +Meat and food in between two slices of bread on a plate. +A crowd of people in a waiting area +The bird is perched alone on the bannister. +Two men playing Ultimate Frisbee, with one holding the disc upside down. +A kitchen with a large clock mounted to it's wall. +An elephant mother with a baby elephant beneath her reaches for branches with her trunk. +A red surfboard is sticking up from the ocean waves. +an image of a man in a boat with a dog +A large building with a tower and clock on top. +A woman riding a red surfboard waiting for a wave. +A man and a woman on a sidewalk standing in front of several suitcases. +A high speed train is seen in the station. +A man preparing food in a restaurant kitchen. +A sign on a pole signifying a no parking zone. +A giraffe leaned over a rail sipping some water. +A eggy casserole is shown from the inside. +A man on a surfboard rides on a wave. +A flat screen TV sitting on top of a TV stand. +A train is going down a hill through a town +A man smiles while holding his cell phone. +Professional baseball player at bat at citi stadium. +A clock with a base made of stones standing near several parked cars. +A bunch of very pretty umbrellas very close together. +Mother and son on the beach with surf board in the foreground. +A street scene with a pole with the clock on top. +A white refrigerator with a double doors and an ice-maker has papers, cards and magnets on the surface, +A man in red shorts plays tennis on a green court. 
+Two men play Wii sports in their living room. +A red and white bicycle next to a building and door. +A skier is taking a turn while surrounded by snowy hills. +A giraffe peeks from behind a low-hanging branch. +A bicycle locked up to a metal pole. +A green school bus drives down the road with bikes on top. +A white horse with its head sticking over a gate. +A brown teddy bear standing next to bottles of honey. +A boy and a man in batting cages. +A person getting ready to use an apple slicer +A child on a beach flying a kite. +a man jumping skis at a ski area +A photo of a kitchen with focus on the oven. +A dog with his leash attached to a bench +A bunch of bananas on a leafy stalk. +A table with a sandwich and two cups of coffee. +a dog by a couch in a living room +There is some tuna and a potato on a white plate. +A gang of bikers driving down a rural country road. +A large elephants reflection in a cars side view mirror, with other elephants in the distance. +A white cake with chocolate being poured over the top of it. +A small desk with lamp, phone, and laptop on it. +A bus parked in front of a bus stop. +A man wearing glasses and a stripped tie. +A young man poses with a surfboard next to water. +A bunch of kids skateboarding off of various objects in a skate park. 
\ No newline at end of file diff --git a/ppdiffusers/examples/pixart_quant/example/pixart/ptq.py b/ppdiffusers/examples/pixart_quant/example/pixart/ptq.py new file mode 100755 index 000000000..fb8582a5a --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/example/pixart/ptq.py @@ -0,0 +1,153 @@ +import os +import sys +import time +import shutil +import argparse +import logging +import ppdiffusers + +import paddle +import paddle.nn as nn +from qdiff.utils import apply_func_to_submodules, seed_everything, setup_logging + +from models.customize_pixart_alpha_pipeline import CustomizePixArtAlphaPipeline +from models.customize_transformer_2d import CustomizeTransformer2DModel + +ppdiffusers.models.Transformer2DModel = CustomizeTransformer2DModel +ppdiffusers.PixArtAlphaPipeline = CustomizePixArtAlphaPipeline +from ppdiffusers import PixArtAlphaPipeline +from omegaconf import OmegaConf, ListConfig + +from qdiff.smooth_quant.sq_quant_layer import SQQuantizedLinear + +def main(args): + seed_everything(args.seed) + paddle.set_grad_enabled(False) + device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" + + if args.log is not None: + if not os.path.exists(args.log): + os.makedirs(args.log) + log_file = os.path.join(args.log, 'run.log') + setup_logging(log_file) + logger = logging.getLogger(__name__) + + pipe = PixArtAlphaPipeline.from_pretrained("/mnt/public/wujunyi_tsinghua/huggingface_cache/hub/models--PixArt-alpha--PixArt-XL-2-1024-MS/snapshots/b89adadeccd9ead2adcb9fa2825d3fabec48d404", from_diffusers=True, from_hf_hub=True) + + # ---- assign quant configs ------ + quant_config = OmegaConf.load(args.quant_config) + pipe.convert_quant(quant_config) + pipe = pipe.to(dtype=paddle.float16).to(device) + model = pipe.transformer + + ''' + INFO: The PTQ process: + for simple PTQ with dynamic act quant: + the weight are quantized with quant_model initialization. + the act quant params are calculated online. 
+ ''' + def init_sq_channel_mask_(module, full_name, calib_data, **kwargs): + """ + module: SQQuantizedLinear(Paddle 版本) + calib_data[full_name]: Tensor of shape [T, C] (Paddle Tensor) + """ + assert isinstance(module, SQQuantizedLinear) + # calib_data[full_name] 形状为 [T, C],按第0维取 max -> [C] + act_mask = paddle.max(calib_data[full_name], axis=0) + module.get_channel_mask(act_mask) # 设置 module.channel_mask + module.update_quantized_weight_scaled() + + + def init_rotation_matrix_(module, full_name): + """ + module: QuarotQuantizedLinear(Paddle 版本) + 这里保持对外部工具函数的导入不变,假定在 Paddle 库中也有相应实现。 + """ + # 若这些类在你的环境中是 paddle 实现,断言有效;否则调整为类名检查或移除断言 + assert isinstance(module, QuarotQuantizedLinear) + from qdiff.quarot.quarot_utils import random_hadamard_matrix, matmul_hadU_cuda + module.get_rotation_matrix() + module.update_quantized_weight_rotated() + + + def init_rotation_and_channel_mask_(module, full_name, calib_data): + """ + module: ViDiTQuantizedLinear(Paddle 版本) + 先基于 calib_data 计算 act mask,再计算 rotation,并更新量化权重。 + """ + assert isinstance(module, ViDiTQuantizedLinear) + act_mask = paddle.max(calib_data[full_name], axis=0) + module.get_channel_mask(act_mask) + module.get_rotation_matrix() + module.update_quantized_weight_rotated_and_scaled() + + ''' + INFO: the smooth_quant quantization. 
+ load act channel mask from the calib data + ''' + if quant_config.get("smooth_quant",None) is not None: + # INFO: the SQQuantizedLayer are initialized with the quant_layer_refactor_ in quant_dit.py + from qdiff.smooth_quant.sq_quant_layer import SQQuantizedLinear + + assert quant_config.calib_data.save_path is not None + calib_path = os.path.join(args.log, quant_config.calib_data.save_path) + calib_data = paddle.load(calib_path) + + # get the channel mask, iter through all layers + kwargs = {} + apply_func_to_submodules(model, + class_type=SQQuantizedLinear, # add hook to all objects of this cls + function=init_sq_channel_mask_, + calib_data = calib_data, + full_name='', + **kwargs + ) + + ''' + INFO: the quarot quantization. + init and apply the rotation matrix + ''' + if quant_config.get("quarot",None) is not None: + + from qdiff.quarot.quarot_quant_layer import QuarotQuantizedLinear + # get the rotation matrix, iter through all layers + kwargs = {} + apply_func_to_submodules(model, + class_type=QuarotQuantizedLinear, # add hook to all objects of this cls + function=init_rotation_matrix_, + full_name='', + **kwargs + ) + ''' + INFO: combining both + ''' + if quant_config.get("viditq",None) is not None: + from qdiff.viditq.viditq_quant_layer import ViDiTQuantizedLinear + + assert quant_config.calib_data.save_path is not None + calib_data = torch.load(os.path.join(args.log, quant_config.calib_data.save_path), weights_only=True) # default wtih + kwargs = {} + apply_func_to_submodules(model, + class_type=ViDiTQuantizedLinear, # add hook to all objects of this cls + function=init_rotation_and_channel_mask_, + full_name='', + calib_data = calib_data, + **kwargs + ) + + model.set_init_done() + model.save_quant_param_dict() + paddle.save(pipe.transformer.quant_param_dict, os.path.join(args.log, 'quant_params.pth')) + logger.info(f'saved quant params into {args.log}') + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--log", 
type=str) + parser.add_argument('--quant-config', required=True, type=str) + parser.add_argument("--cfg-scale", type=float, default=4.0) + parser.add_argument("--num-sampling-steps", type=int, default=10) + parser.add_argument("--prompt", type=str, default=None) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--ckpt", type=str, default=None) + args = parser.parse_args() + main(args) diff --git a/ppdiffusers/examples/pixart_quant/example/pixart/quant_inference.py b/ppdiffusers/examples/pixart_quant/example/pixart/quant_inference.py new file mode 100755 index 000000000..f04601e3c --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/example/pixart/quant_inference.py @@ -0,0 +1,84 @@ +import os +import sys +import time +import shutil +import argparse +import logging +import ppdiffusers + +import paddle +from qdiff.utils import apply_func_to_submodules, seed_everything, setup_logging + +from models.customize_pixart_alpha_pipeline import CustomizePixArtAlphaPipeline +from models.customize_transformer_2d import CustomizeTransformer2DModel + +ppdiffusers.models.Transformer2DModel = CustomizeTransformer2DModel +ppdiffusers.PixArtAlphaPipeline = CustomizePixArtAlphaPipeline +from ppdiffusers import PixArtAlphaPipeline +from omegaconf import OmegaConf, ListConfig + +def main(args): + seed_everything(args.seed) + paddle.set_grad_enabled(False) + device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" + + if args.log is not None: + if not os.path.exists(args.log): + os.makedirs(args.log) + log_file = os.path.join(args.log, 'run.log') + setup_logging(log_file) + logger = logging.getLogger(__name__) + + pipe = PixArtAlphaPipeline.from_pretrained("/mnt/public/wujunyi_tsinghua/huggingface_cache/hub/models--PixArt-alpha--PixArt-XL-2-1024-MS/snapshots/b89adadeccd9ead2adcb9fa2825d3fabec48d404", from_diffusers=True, from_hf_hub=True) + + # ---- assign quant configs ------ + quant_config = OmegaConf.load(args.quant_config) + 
# Local snapshot of the PixArt checkpoint used by this example.
_PIXART_MODEL_PATH = "/mnt/public/wujunyi_tsinghua/huggingface_cache/hub/models--PixArt-alpha--PixArt-XL-2-1024-MS/snapshots/b89adadeccd9ead2adcb9fa2825d3fabec48d404"


def main(args):
    """Generate images for every prompt in the prompt file and save them.

    Args:
        args: Parsed command-line namespace (see the ``__main__`` block).
            ``args.log`` is the output directory; when it is not given the
            current directory is used instead of crashing in ``os.path.join``.
    """
    seed_everything(args.seed)
    paddle.set_grad_enabled(False)
    device = "gpu" if paddle.is_compiled_with_cuda() else "cpu"

    # Fall back to the working directory so that `--log` stays optional.
    log_dir = args.log if args.log is not None else "."
    os.makedirs(log_dir, exist_ok=True)
    setup_logging(os.path.join(log_dir, 'run.log'))
    logger = logging.getLogger(__name__)

    pipe = PixArtAlphaPipeline.from_pretrained(_PIXART_MODEL_PATH, from_diffusers=True, from_hf_hub=True)

    # ---- assign quant configs ------
    quant_config = OmegaConf.load(args.quant_config)
    # NOTE: the quantization steps are disabled in this script as committed;
    # re-enable them together to run the quantized model:
    #   pipe.convert_quant(quant_config)
    #   quant_param_ckpt = paddle.load(os.path.join(args.log, args.quant_param_ckpt))
    #   model.load_quant_param_dict(quant_param_ckpt)
    pipe = pipe.to(dtype=paddle.float16).to(device)
    model = pipe.transformer

    logger.info(str(model))

    # Read the prompts, one per line; blank lines are not prompts.
    prompt_path = args.prompt if args.prompt is not None else "./prompts.txt"
    with open(prompt_path, 'r') as f:
        prompts = [line.strip() for line in f if line.strip()]

    # The directory is the same for every batch, so create it once.
    save_path = os.path.join(log_dir, "generated_images")
    os.makedirs(save_path, exist_ok=True)

    n_batch = len(prompts) // args.batch_size  # drop_last
    for i in range(n_batch):
        start = i * args.batch_size
        images = pipe(
            prompt=prompts[start: start + args.batch_size],
            num_inference_steps=args.num_sampling_steps
        ).images
        print(f"Export image of batch {i}")

        # Iterate over what the pipeline actually returned.
        for i_image, image in enumerate(images):
            image.save(os.path.join(save_path, f"output_{i_image + start}.jpg"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--log", type=str)
    parser.add_argument('--quant-config', required=True, type=str)
    parser.add_argument("--quant_param_ckpt", type=str, default="./quant_params.pth")
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument("--num-sampling-steps", type=int, default=20)
    parser.add_argument("--prompt", type=str, default=None)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--hardware", action='store_true', help='whether to use_cuda_kernel')
    parser.add_argument("--profile", action='store_true', help='profile mode, measure the e2e latency')
    parser.add_argument("--quant_weight_ckpt", type=str, default=None)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--ckpt", type=str, default=None)
    args = parser.parse_args()
    main(args)
a/ppdiffusers/examples/pixart_quant/example/pixart/samples_16.txt b/ppdiffusers/examples/pixart_quant/example/pixart/samples_16.txt new file mode 100644 index 000000000..eb0cc1803 --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/example/pixart/samples_16.txt @@ -0,0 +1,16 @@ +A small cactus with a happy face in the Sahara desert. +Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail. +beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background +stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background. +nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph. 
+Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism +anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur +The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8 +Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens. 
+8k uhd A man looks up at the starry sky, lonely and ethereal, Minimalism, Chaotic composition Op Art +A middle-aged woman of Asian descent, her dark hair streaked with silver, appears fractured and splintered, intricately embedded within a sea of broken porcelain. The porcelain glistens with splatter paint patterns in a harmonious blend of glossy and matte blues, greens, oranges, and reds, capturing her dance in a surreal juxtaposition of movement and stillness. Her skin tone, a light hue like the porcelain, adds an almost mystical quality to her form. +A 4k dslr image of a lemur wearing a red magician hat and a blue coat performing magic tricks with cards in a garden. +A alpaca made of colorful building blocks, cyberpunk +A baby painter trying to draw very simple picture, white background +A boy and a girl fall in love +A dog that has been meditating all the time \ No newline at end of file diff --git a/ppdiffusers/examples/pixart_quant/quant_utils/pyproject.toml b/ppdiffusers/examples/pixart_quant/quant_utils/pyproject.toml new file mode 100644 index 000000000..8fe2f47af --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/quant_utils/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/__init__.py b/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/base/__init__.py b/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/base/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/base/base_quantizer.py b/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/base/base_quantizer.py new file mode 100644 index 000000000..4e4b8824f --- /dev/null +++ b/ppdiffusers/examples/pixart_quant/quant_utils/qdiff/base/base_quantizer.py @@ -0,0 +1,235 
import logging
import math
import time
import warnings
from typing import Union

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from omegaconf import ListConfig

logger = logging.getLogger(__name__)


class BaseQuantizer(nn.Layer):
    """Common state shared by the static and dynamic fake-quantizers.

    Args:
        quant_config: Mapping with ``n_bits`` (required) and ``sym``
            (optional, defaults to ``False``).

    Raises:
        AssertionError: If ``n_bits`` is a plain list (mixed precision must
            use the mixed-precision quantizer instead).
    """

    def __init__(self, quant_config):
        super(BaseQuantizer, self).__init__()

        # unpack the quant configurations
        self.n_bits = quant_config['n_bits']
        self.sym = quant_config.get('sym', False)

        if isinstance(self.n_bits, list):
            raise AssertionError("when multiple n_bits are adopted, use the MixedPrecisionBaseQuantizer")

        # Filled in with tensors once the quant params are known.
        self.delta = None
        self.zero_point = None

        # INFO: for mixed_precision, n_bits could be a ListConfig and n_levels
        # is then initialized in the subclass.
        if not isinstance(self.n_bits, ListConfig):
            if self.sym:
                self.n_levels = 2 ** (self.n_bits - 1) - 1
            else:
                self.n_levels = 2 ** self.n_bits

        self.init_done = False
        # Used in log messages; overwritten with the layer's full name later.
        self.module_name = self.__class__.__name__

    def forward(self, x: paddle.Tensor):
        raise NotImplementedError("should be implemented in subclass.")

    def init_quant_params(self, x):
        raise NotImplementedError("should be implemented in subclass.")


class StaticQuantizer(BaseQuantizer):
    """Quantizer whose (delta, zero_point) are calibrated once and stored.

    The input shape should be ``[Group, -1]``; one (delta, zero_point) pair is
    kept per group. Until ``init_done`` is set to ``True`` externally, every
    call re-runs :meth:`init_quant_params` on the incoming tensor.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)

        # Running statistics over all tensors seen during calibration.
        if self.sym:
            self.x_absmax = None
        else:
            self.x_max = None
            self.x_min = None

    def forward(self, x: paddle.Tensor):
        """Return the fake-quantized (quantize then dequantize) tensor."""
        x_quant = self.quantize(x)
        return (x_quant + self.zero_point) * self.delta

    def quantize(self, x: paddle.Tensor):
        """Return the clipped integer representation of ``x``."""
        if self.init_done is not True:  # set as True externally when done
            self.init_quant_params(x)
        x_int = paddle.round(x / self.delta) - self.zero_point
        return paddle.clip(x_int, min=-self.n_levels - 1, max=self.n_levels)

    def init_quant_params(self, x: paddle.Tensor):
        """Compute per-group delta / zero_point from ``x`` and store them.

        Raises:
            AssertionError: If ``x`` is not 2-D or any delta is <= 1e-6.
        """
        if len(x.shape) != 2:  # [N_group, -1]
            raise AssertionError("expected a 2-D input of shape [N_group, -1]")

        if self.sym:
            x_absmax = paddle.max(paddle.abs(x), axis=1)
            if self.x_absmax is not None:
                self.x_absmax = paddle.maximum(self.x_absmax, x_absmax)
            else:
                self.x_absmax = x_absmax
            delta = x_absmax / self.n_levels
            zero_point = paddle.zeros_like(delta)
        else:
            # The representable range always has to contain zero.
            x_max = paddle.max(x, axis=1)
            x_max = paddle.where(x_max < 0., paddle.zeros_like(x_max), x_max)
            if self.x_max is not None:
                self.x_max = paddle.maximum(self.x_max, x_max)
            else:
                self.x_max = x_max

            x_min = paddle.min(x, axis=1)
            x_min = paddle.where(x_min > 0., paddle.zeros_like(x_min), x_min)
            if self.x_min is not None:
                self.x_min = paddle.minimum(self.x_min, x_min)
            else:
                self.x_min = x_min

            delta = (x_max - x_min) / (self.n_levels - 1)
            zero_point = paddle.round(x_min / delta) + (self.n_levels / 2)

        # Explicit check (not `assert`) so it still runs under `python -O`.
        if not bool(paddle.all(delta > 1.e-6)):
            raise AssertionError("unexpected small delta exists")

        self.delta = paddle.unsqueeze(delta, axis=-1)  # [G] -> [G,1]
        self.zero_point = paddle.unsqueeze(zero_point, axis=-1)


class DynamicQuantizer(BaseQuantizer):
    """Quantizer that recomputes (delta, zero_point) on every call.

    The input shape should be ``[Group, -1]``.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)

    def _floor_delta(self, delta, eps):
        """Replace deltas smaller than ``eps`` by ``eps`` and log when it happens."""
        abs_delta = paddle.abs(delta)
        if bool(paddle.all(abs_delta > eps)):
            return delta
        # Report the offending value as it was before clamping.
        smallest = float(paddle.min(abs_delta).numpy())
        logger.info("unexpected small delta: {:.3e} exists in {}, set as eps".format(smallest, self.module_name))
        return paddle.where(abs_delta < eps, paddle.full_like(delta, eps), delta)

    def quantize(self, x: paddle.Tensor):
        """Compute the quant params from ``x`` and return its integer codes.

        Raises:
            AssertionError: If ``x`` is not 2-D or contains NaN.
        """
        if len(x.shape) != 2:  # [N_group, -1]
            raise AssertionError("expected a 2-D input of shape [N_group, -1]")
        if bool(paddle.any(paddle.isnan(x))):
            raise AssertionError("NaN found in the tensor to quantize")

        if self.sym:
            x_absmax = paddle.max(paddle.abs(x), axis=1)
            self.x_absmax = x_absmax

            delta = self._floor_delta(x_absmax / self.n_levels, 1.e-6)
            zero_point = paddle.zeros_like(delta)
        else:
            x_max = paddle.max(x, axis=1)
            x_max = paddle.where(x_max < 0., paddle.zeros_like(x_max), x_max)
            self.x_max = x_max

            x_min = paddle.min(x, axis=1)
            x_min = paddle.where(x_min > 0., paddle.zeros_like(x_min), x_min)
            self.x_min = x_min

            delta = self._floor_delta((x_max - x_min) / (self.n_levels - 1), 1.e-8)
            zero_point = paddle.round(x_min / delta) + (self.n_levels / 2)

        self.delta = paddle.unsqueeze(delta, axis=-1)  # [G] -> [G,1]
        self.zero_point = paddle.unsqueeze(zero_point, axis=-1)

        x_int = paddle.round(x / self.delta) - self.zero_point
        return paddle.clip(x_int, min=-self.n_levels - 1, max=self.n_levels)

    def forward(self, x: paddle.Tensor):
        """Return the fake-quantized (quantize then dequantize) tensor."""
        x_quant = self.quantize(x)
        return (x_quant + self.zero_point) * self.delta


if __name__ == '__main__':
    paddle.set_device('cpu')  # switch to 'gpu' to test on GPU

    # Build a quantization config.
    quant_config = {
        'n_bits': 8,
        'sym': True
    }

    # StaticQuantizer
    print("==== StaticQuantizer Test ====")
    static_quantizer = StaticQuantizer(quant_config)

    # A [Group, Feature] tensor: 4 groups of 16 elements each.
    x = paddle.randn([4, 16], dtype='float32')

    # The first forward triggers init_quant_params.
    y_static = static_quantizer(x)
    print("Input shape:", x.shape)
    print("Output shape (Static):", y_static.shape)
    print("Delta shape:", static_quantizer.delta.shape)
    print("Zero point shape:", static_quantizer.zero_point.shape)
    print("Sample output (Static):", y_static[0, :5])

    # DynamicQuantizer
    print("\n==== DynamicQuantizer Test ====")
    dynamic_quantizer = DynamicQuantizer(quant_config)

    y_dynamic = dynamic_quantizer(x)
    print("Input shape:", x.shape)
    print("Output shape (Dynamic):", y_dynamic.shape)
    print("Delta shape:", dynamic_quantizer.delta.shape)
    print("Zero point shape:", dynamic_quantizer.zero_point.shape)
    print("Sample output (Dynamic):", y_dynamic[0, :5])

    print("\n✅ Paddle Quantizer forward 测试完成!")
    print(x[0,:5])
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from qdiff.base.base_quantizer import StaticQuantizer, DynamicQuantizer

from omegaconf import ListConfig


class QuantizedLinear(paddle.nn.Linear):
    """Linear layer with fake-quantized weight and activation.

    - static weight quantization (``w_quantizer``), applied once at init
    - dynamic activation quantization (``a_quantizer``), applied per forward

    Args:
        in_features: Size of the input channel dimension.
        out_features: Size of the output channel dimension.
        bias: Whether the layer has a bias.
        device: Ignored; kept for API parity with the other quant layers.
        quant_config: Mapping with optional ``weight`` / ``act`` sections; a
            missing section disables that quantizer.
        fp_module: The full-precision Linear whose parameters are copied.

    Raises:
        ValueError: If weight quantization is requested without ``fp_module``.
        RuntimeError: If the fp weight shape does not match this layer.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device: None = None,
        quant_config: dict = None,
        fp_module: paddle.nn.Linear = None,
    ) -> None:
        super().__init__(in_features, out_features, bias_attr=bias)

        self.fp_module = fp_module
        self.q_cfg = quant_config or {}

        # None means "do not quantize this part".
        self.w_quantizer = None
        self.a_quantizer = None

        if self.q_cfg.get('weight', None) is not None:
            if self.fp_module is None:
                raise ValueError("fp_module must be provided when weight quantization is enabled.")
            self.w_quantizer = StaticQuantizer(self.q_cfg['weight'])

            # The weight is transposed before quantization so that each row
            # passed to the quantizer is one output channel, then transposed
            # back to the layout of `self.weight`.
            w_q = self.w_quantizer(self.fp_module.weight.t()).t()
            if tuple(w_q.shape) != tuple(self.weight.shape):
                raise RuntimeError(f"quantized weight shape {w_q.shape} != target weight shape {tuple(self.weight.shape)}")
            self.weight.set_value(w_q)
            # Weight quant params are final; do not recalibrate on later calls.
            self.w_quantizer.init_done = True
        elif self.fp_module is not None:
            if tuple(self.fp_module.weight.shape) != tuple(self.weight.shape):
                raise RuntimeError("fp_module.weight shape mismatch")
            self.weight.set_value(self.fp_module.weight)
        # else: no fp_module, keep the default random init

        # Keep a reference to the fp weight; the bias always stays in float.
        self.fp_weight = self.fp_module.weight if self.fp_module is not None else None
        if self.fp_module is not None and self.fp_module.bias is not None:
            if self.bias is not None:
                self.bias.set_value(self.fp_module.bias)

        if self.q_cfg.get('act', None) is not None:
            self.a_quantizer = DynamicQuantizer(self.q_cfg['act'])

        self.use_kernel = False  # whether to use the cuda kernel for actual saving
        self.quant_mode = True   # when False, run the original fp forward

    def forward(self, x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
        """Apply the (fake-)quantized linear transform.

        Args:
            x: Input of shape ``[..., C]`` with at least two dimensions,
                e.g. ``[B, N_token, C]`` or ``[B, C]``. All leading
                dimensions are flattened so the activation quantizer sees
                ``[rows, C]``.

        Raises:
            ValueError: If ``x`` has fewer than two dimensions.
        """
        if not self.quant_mode:
            if self.fp_module is not None:
                return self.fp_module(x, *args, **kwargs)
            return super().forward(x)

        orig_shape = list(x.shape)
        if len(orig_shape) < 2:
            raise ValueError("Expected x shape [..., C] with at least 2 dims for QuantizedLinear forward.")

        if self.a_quantizer is not None:
            flat = paddle.reshape(x, [-1, orig_shape[-1]])
            x = paddle.reshape(self.a_quantizer(flat), orig_shape)

        return F.linear(x, self.weight, self.bias)


if __name__ == "__main__":
    # quick test for the Paddle QuantizedLinear
    paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')

    # Seed before creating the layer so its initialization is deterministic.
    paddle.seed(42)

    # prepare a FP linear as fp_module
    in_features = 16
    out_features = 8
    fp_linear = paddle.nn.Linear(in_features, out_features, bias_attr=True)

    # simple quant_config example (symmetric 8-bit for both weight and act)
    quant_config = {
        'weight': {
            'n_bits': 8,
            'sym': True
        },
        'act': {
            'n_bits': 8,
            'sym': True
        }
    }

    qlinear = QuantizedLinear(in_features, out_features, bias=True, device=None, quant_config=quant_config, fp_module=fp_linear)

    # build a dummy input [B, N_token, C]
    B = 2
    N_token = 3
    C = in_features
    x = paddle.randn([B, N_token, C], dtype='float32')

    # forward in quant mode
    qlinear.quant_mode = True
    y_q = qlinear(x)
    print("Quant mode output shape:", y_q.shape)
    print("Sample output (quant):", y_q.flatten()[:6].numpy())

    # forward in FP mode (use fp_module)
    qlinear.quant_mode = False
    y_fp = qlinear(x)
    print("FP mode output shape:", y_fp.shape)
    print("Sample output (fp):", y_fp.flatten()[:6].numpy())

    print("Done.")
import importlib
import logging
import re

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from qdiff.base.base_quantizer import BaseQuantizer, StaticQuantizer, DynamicQuantizer
from qdiff.base.quant_layer import QuantizedLinear
from qdiff.utils import apply_func_to_submodules

logger = logging.getLogger(__name__)


def quant_layer_refactor_(submodule, name, parent_module, quant_config=None, full_name=None, remain_fp_regex=None):
    """Replace a Linear submodule by its quantized counterpart, in place.

    Intended as a callback for ``apply_func_to_submodules``.

    Args:
        submodule: The ``nn.Linear`` to replace.
        name: Attribute name of ``submodule`` inside ``parent_module``.
        parent_module: Layer that owns ``submodule``.
        quant_config: Quantization config; optional ``smooth_quant`` /
            ``quarot`` / ``viditq`` sections select a specialised layer type
            for layers whose name matches their ``layer_name_regex``.
        full_name: Qualified name of the layer, matched against the regexes;
            falls back to ``name`` when not given.
        remain_fp_regex: Layers whose full name matches are left untouched.

    Raises:
        RuntimeError: If the Linear weight is not 2-D.
    """
    if full_name is None:
        full_name = name

    # Decide first whether the layer stays FP, so that no misleading
    # "setting ..." message is logged for a layer that is never replaced.
    if remain_fp_regex is not None and re.search(remain_fp_regex, full_name):
        logger.info(f"remain {full_name} quant as FP due to fp_regex")
        return

    # (config section, module, class, log message); later entries win.
    methods = (
        ("smooth_quant", "qdiff.smooth_quant.sq_quant_layer", "SQQuantizedLinear",
         '[INFO] setting smooth quant for layer {}'),
        ("quarot", "qdiff.quarot.quarot_quant_layer", "QuarotQuantizedLinear",
         'setting quarot for layer {}'),
        ("viditq", "qdiff.viditq.viditq_quant_layer", "ViDiTQuantizedLinear",
         'setting viditq for layer {}'),
    )
    quant_layer_type = QuantizedLinear
    if quant_config is not None:
        for section, module_path, class_name, message in methods:
            method_cfg = quant_config.get(section, None)
            if method_cfg is None:
                continue
            if re.search(method_cfg.layer_name_regex, full_name):
                # Imported lazily: the optional packages are only needed
                # when the corresponding method is actually configured.
                quant_layer_type = getattr(importlib.import_module(module_path), class_name)
                logger.info(message.format(full_name))

    # Paddle Linear weight shape: [in_features, out_features]
    w_shape = list(submodule.weight.shape)
    if len(w_shape) != 2:
        raise RuntimeError(f"Unexpected weight shape for Linear layer {full_name}: {w_shape}")
    in_features, out_features = int(w_shape[0]), int(w_shape[1])
    bias = getattr(submodule, 'bias', None) is not None

    # Passed for API parity only.
    device = paddle.get_device()

    quant_layer = quant_layer_type(in_features, out_features, bias, device, quant_config, submodule)
    setattr(parent_module, name, quant_layer)

    # Tag the layer and its quantizers with the qualified name (used in logs).
    quant_layer.module_name = full_name
    if quant_layer.w_quantizer is not None:
        quant_layer.w_quantizer.module_name = full_name
    if quant_layer.a_quantizer is not None:
        quant_layer.a_quantizer.module_name = full_name
def bitwidth_refactor_(submodule, name=None, parent_module=None, quant_config=None, full_name=None):
    """Apply the mixed-precision bitwidth assignment to one quantized layer.

    ``quant_config.mixed_precision.weight.layer_name_regex`` and
    ``...act.layer_name_regex`` are lists of regexes. A layer matching entry 0
    is switched to FP (``quant_mode = False``); a layer matching entry ``i > 0``
    has the corresponding quantizer switched to bitwidth index ``i - 1``.
    Empty regex entries are skipped. Nothing happens when ``quant_config`` or
    its ``mixed_precision`` section (or one of its sub-sections) is absent.

    Args:
        submodule: The quantized layer (has ``quant_mode``, ``w_quantizer``,
            ``a_quantizer``).
        name: Unused; accepted for callback compatibility.
        parent_module: Unused; accepted for callback compatibility.
        quant_config: Quantization config (dict-like with attribute access).
        full_name: Qualified layer name matched against the regexes.
    """
    import logging
    import re

    log = logging.getLogger(__name__)

    if quant_config is None:
        return
    # The section is optional (it is commented out in the default config).
    mixed = quant_config.get("mixed_precision", None)
    if mixed is None:
        return

    # Weights are handled before activations, as in the log order.
    for section_name, tag, quantizer_attr in (("weight", "W", "w_quantizer"), ("act", "A", "a_quantizer")):
        section = mixed.get(section_name, None)
        if section is None:
            continue
        for idx, layer_regex in enumerate(section.layer_name_regex):
            if len(layer_regex) == 0:  # skip empty regex entries
                continue
            if not re.search(layer_regex, full_name):
                continue
            if idx == 0:  # FP16 (or FP)
                submodule.quant_mode = False
                log.info(f'[Mixed Precision] set the {full_name} {tag} as FP16')
            else:
                # idx-1 maps to the index in the quantizer's bitwidth list
                quantizer = getattr(submodule, quantizer_attr)
                quantizer.bitwidth_refactor(idx - 1)
                log.info(f'[Mixed Precision] set the {full_name} {tag} as {quantizer.bitwidth_list[idx-1]} bit')
def load_quant_param_dict_(submodule, full_name=None, parent_module=None, quant_param_dict=None, model=None, **kwargs):
    """Load the saved quant params of one quantizer and refresh its layer.

    Does nothing when ``quant_param_dict`` is ``None`` or has no entry for
    ``full_name``.

    Args:
        submodule: The quantizer receiving ``delta`` and ``zero_point``.
        full_name: Key of this quantizer in ``quant_param_dict``.
        parent_module: The quant layer owning the quantizer. Depending on
            whether it exposes ``channel_mask`` and/or ``rotation_matrix``,
            the rotation matrix is recomputed, the channel mask restored and
            the matching ``update_quantized_weight_*`` method called.
        quant_param_dict: Mapping ``full_name -> saved params``.
        model: If given, its ``quant_param_dict`` is updated with the entry.
    """
    if quant_param_dict is None or full_name not in quant_param_dict:
        return

    saved = quant_param_dict[full_name]
    submodule.delta = saved['delta']
    submodule.zero_point = saved['zero_point']

    # The specialised layer classes are recognised by the attributes they
    # expose, so that they do not have to be imported here.
    has_mask = hasattr(parent_module, 'channel_mask')
    has_rotation = hasattr(parent_module, 'rotation_matrix')

    if has_rotation:
        # The rotation matrix is not saved; recompute it.
        parent_module.get_rotation_matrix()
    if has_mask:
        parent_module.channel_mask = saved['channel_mask']

    if has_mask and has_rotation:      # rotation + channel mask (ViDiT-Q style)
        parent_module.update_quantized_weight_rotated_and_scaled()
    elif has_rotation:                 # rotation only (QuaRot style)
        parent_module.update_quantized_weight_rotated()
    elif has_mask:                     # channel mask only (SmoothQuant style)
        parent_module.update_quantized_weight_scaled()

    # Keep the model-level dict in sync.
    if model is not None:
        model.quant_param_dict[full_name] = saved
def save_quant_param_dict_(submodule, full_name=None, parent_module=None, model=None, **kwargs):
    """Store the quant params of one quantizer in ``model.quant_param_dict``.

    Args:
        submodule: The quantizer providing ``delta`` and ``zero_point``.
        full_name: Key under which the entry is stored.
        parent_module: The quant layer owning the quantizer; its
            ``channel_mask`` is saved when present. A ``rotation_matrix`` is
            recorded as ``None`` only (it is large and gets recomputed on load).
        model: Object holding ``quant_param_dict``; nothing is done if ``None``.
    """
    if model is None:
        return

    entry = {
        'delta': submodule.delta,
        'zero_point': submodule.zero_point,
    }
    if hasattr(parent_module, 'channel_mask'):
        entry['channel_mask'] = parent_module.channel_mask
    if hasattr(parent_module, 'rotation_matrix'):
        entry['rotation_matrix'] = None

    model.quant_param_dict[full_name] = entry


def set_init_done_(submodule, **kwargs):
    """Mark quantizer init_done flag."""
    submodule.init_done = True
# IMPORTANT: this file is simply a template, you should inherit the model you
# are using and implement these functions.
# ref the examples in `examples/dit/models/quant_dit.py`
class QuantModel(nn.Layer):
    """Base class of a quantized model (Paddle).

    Holds the quant config and the per-layer quant-param dict, and offers the
    tree-walking helpers built on ``apply_func_to_submodules``. Specialised
    behaviour (in particular ``forward``) is implemented in subclasses.

    Args:
        quant_config: Quantization config; ``None`` is treated as empty.
    """
    def __init__(self, quant_config: dict = None, **kwargs) -> None:
        super().__init__()

        # additional attributes for quant
        self.q_cfg = quant_config or {}
        self.quant_param_dict = {}

        # Replace the Linear layers that exist at this point. A subclass that
        # creates its layers afterwards has to call this method again.
        self.quant_layer_refactor()

    def quant_layer_refactor(self):
        """Replace every ``nn.Linear`` by a quantized layer per ``q_cfg``."""
        apply_func_to_submodules(
            self,
            class_type=nn.Linear,
            function=quant_layer_refactor_,
            quant_config=self.q_cfg
        )

    def save_quant_params_dict(self):
        """Collect the quant params of all quantizers into ``quant_param_dict``."""
        apply_func_to_submodules(
            self,
            class_type=BaseQuantizer,
            function=save_quant_param_dict_,
            model=self
        )

    def load_quant_params_dict(self, quant_param_dict):
        """Load saved quant params into all quantizers."""
        apply_func_to_submodules(
            self,
            class_type=BaseQuantizer,
            function=load_quant_param_dict_,
            quant_param_dict=quant_param_dict,
            model=self
        )

    # The example scripts use the singular spelling; accept both.
    save_quant_param_dict = save_quant_params_dict
    load_quant_param_dict = load_quant_params_dict

    def set_init_done(self):
        """Mark every quantizer as calibrated."""
        apply_func_to_submodules(
            self,
            class_type=BaseQuantizer,
            function=set_init_done_,
        )

    def bitwidth_refactor(self):
        """Apply the mixed-precision assignment to every quantized layer."""
        apply_func_to_submodules(
            self,
            class_type=QuantizedLinear,
            function=bitwidth_refactor_,
            quant_config=self.q_cfg
        )

    def forward(self, x, *args, **kwargs):
        raise NotImplementedError("should be implemented in subclass.")
and attribute access (.smooth_quant) + class Cfg(dict): + def __getattr__(self, name): + if name in self: + return self[name] + raise AttributeError(name) + def __setattr__(self, name, val): + self[name] = val + + # Build a simple quant_config compatible with the refactor callbacks + quant_config = Cfg() + quant_config['weight'] = Cfg(n_bits=8, sym=True) + quant_config['act'] = Cfg(n_bits=8, sym=True) + # placeholders for optional subconfigs + quant_config['smooth_quant'] = None + quant_config['quarot'] = None + quant_config['viditq'] = None + + + # Minimal Model subclass that creates layers first, then calls QuantModel.__init__ + class MyModel(QuantModel): + def __init__(self, quant_cfg): + # create layers BEFORE calling QuantModel.__init__ because parent __init__ triggers refactor + # two linear layers for testing + super().__init__(quant_cfg) + self.fc1 = nn.Linear(16, 16, bias_attr=True) + self.fc2 = nn.Linear(16, 8, bias_attr=True) + # call parent constructor to set up quant logic (this will call quant_layer_refactor) + + + def forward(self, x): + # a simple forward that passes x through the two (possibly quantized) linear layers + # expected input shape: [B, N_token, C], where C==16 + # Just apply sequentially + x = self.fc1(x) + x = paddle.nn.functional.relu(x) + x = self.fc2(x) + return x + + # Instantiate model and run tests + model = MyModel(quant_config) + print("Model after quant_layer_refactor:") + print(model) + + # Create a dummy input: B=2, N_token=3, C=16 + B, N_token, C = 2, 3, 16 + x = paddle.randn([B, N_token, C], dtype='float32') + + # Ensure quant_mode True (default in QuantizedLinear) + print("\n--- Forward (quant mode) ---") + # run forward (QuantModel.forward is implemented by MyModel) + y_q = model(x) + print("Output shape (quant):", y_q.shape) + print("Sample outputs (quant):", y_q.flatten()[:6].numpy()) + + # Save quant params dict after one forward (so quantizers are initialized) + model.save_quant_params_dict() + print("\nSaved 
quant_param_dict keys:", list(model.quant_param_dict.keys())) + # print a sample layer's delta/zero_point shapes if available + if len(model.quant_param_dict) > 0: + sample_name = list(model.quant_param_dict.keys())[0] + sample = model.quant_param_dict[sample_name] + print(f"Sample layer '{sample_name}' quant params:") + print(" delta:", getattr(sample.get('delta', None), 'shape', None)) + print(" zero_point:", getattr(sample.get('zero_point', None), 'shape', None)) + + # Create a second new model and load the saved quant params into it to verify load logic + print("\n--- Load quant params into a fresh model and forward ---") + model2 = MyModel(quant_config) + # before loading, model2.quant_param_dict should be empty + print("model2.quant_param_dict keys (before load):", list(model2.quant_param_dict.keys())) + # load saved params + model2.load_quant_params_dict(model.quant_param_dict) + # set init_done flags + model2.set_init_done() + # run forward in quant mode + y_q2 = model2(x) + print("Output shape (quant) after load:", y_q2.shape) + print("Sample outputs (quant) after load:", y_q2.flatten()[:6].numpy()) + + # Compare outputs (they may not be identical due to randomness and how quantization is implemented, + # but this at least ensures code paths run without errors). 
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from qdiff.base.quant_layer import QuantizedLinear  # must be the Paddle port of the base class


class SQQuantizedLinear(QuantizedLinear):
    """Quantized linear layer with per-input-channel smoothing.

    Static weight quantization + dynamic activation quantization. Before
    quantization the activation is multiplied by a per-input-channel mask and
    the weight is divided by the same mask, so the product is unchanged.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool,
        device: None,
        quant_config: dict,
        fp_module: paddle.nn.Linear,
    ) -> None:
        """Build the layer; all arguments are forwarded to ``QuantizedLinear``.

        ``quant_config.smooth_quant.alpha`` is read as the smoothing exponent.
        """
        super().__init__(in_features, out_features, bias, device, quant_config, fp_module)

        self.alpha = quant_config.smooth_quant.alpha
        # Filled in from outside during PTQ via get_channel_mask().
        self.channel_mask = None

    def get_channel_mask(self, act_mask):
        """Compute and store the per-input-channel scaling mask.

        Args:
            act_mask: channel-wise activation magnitude, shape ``[C_in]``.

        Raises:
            AssertionError: if the resulting mask contains inf or NaN.
        """
        # fp_module.weight is laid out as [C_in, C_out] (see the unpacking in
        # update_quantized_weight_scaled), so reducing axis=1 gives one maximum
        # per input channel -> shape [C_in].
        weight_mask = paddle.max(paddle.abs(self.fp_module.weight), axis=1)
        # Both operands are non-negative, so the fractional powers cannot produce
        # NaN from a negative base.
        channel_mask = (weight_mask ** self.alpha) / (paddle.abs(act_mask) ** (1.0 - self.alpha))
        self.channel_mask = channel_mask

        if paddle.isinf(self.channel_mask).any().item():
            raise AssertionError("inf exists in channel_mask")
        # 0 / 0 (dead weight channel and dead activation channel) yields NaN, not inf;
        # fail here instead of later in the weight/activation NaN checks.
        if paddle.isnan(self.channel_mask).any().item():
            raise AssertionError("nan exists in channel_mask")

    def update_quantized_weight_scaled(self):
        """Re-quantize the FP weight after dividing it by ``channel_mask``.

        Raises:
            AssertionError: if ``channel_mask`` is unset or the result contains NaN.
        """
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if self.channel_mask is None:
            raise AssertionError("channel_mask is not set")
        C_in = self.fp_module.weight.shape[0]

        # Clear the init flag so the weight quantizer recomputes its parameters.
        self.w_quantizer.init_done = False

        # The quantizer is fed the transposed ([C_out, C_in]) weight, scaled per input channel.
        scaled = self.fp_module.weight.t() / self.channel_mask.reshape([1, C_in])
        q_w = self.w_quantizer(scaled)

        # set_value needs a tensor of the same shape as self.weight, hence the transpose back.
        self.weight.set_value(q_w.t())

        if paddle.isnan(self.weight).any().item():
            raise AssertionError("nan exists in weight")

        self.w_quantizer.init_done = True

    def forward(self, x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
        """Apply the layer.

        Args:
            x: input of shape ``[..., C]`` (e.g. ``[B, N_token, C]`` or ``[B, C]``).
            *args, **kwargs: accepted for call compatibility and ignored.

        Returns:
            The linear output with the same leading dimensions as ``x``.

        Raises:
            AssertionError: in quant mode, if ``channel_mask`` is unset or the
                scaled activation contains NaN.
        """
        if not getattr(self, "quant_mode", False):
            # FP path: delegate to the wrapped full-precision layer.
            return self.fp_module(x)

        if self.channel_mask is None:
            raise AssertionError("channel_mask is not set")

        orig_shape = x.shape
        C = orig_shape[-1]

        # Scale the activation per channel; a [C] mask broadcasts over all leading dims.
        x = x * self.channel_mask.reshape([C])

        # Flatten to [N, C] for the activation quantizer.
        x = x.reshape([-1, C])

        if paddle.isnan(x).any().item():
            raise AssertionError("nan exists in x")

        x = self.a_quantizer(x)

        # Restore the caller's shape before the matmul.
        x = x.reshape(orig_shape)

        return F.linear(x, self.weight, self.bias)
import numpy as np
import random
import os
import logging.config

import paddle
import paddle.nn as nn
from typing import Callable, Dict, Any, Optional


def apply_func_to_submodules(
    module: paddle.nn.Layer,
    class_type: type,
    function: Callable,
    parent_name: str = "",
    return_d: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    """Recursively apply ``function`` to every sub-layer that is a ``class_type``.

    Args:
        module: root layer to traverse.
        class_type: type (or tuple of types) a sub-layer must be an instance of.
        function: called as ``function(submodule, **kwargs)`` for each match.
        parent_name: dotted name of ``module``, used to build ``full_name``.
        return_d: optional dict collecting the return values keyed by ``full_name``.
        **kwargs: forwarded to ``function``. ``name``, ``full_name`` and
            ``parent_module`` are added/overwritten for each call.

    Returns:
        ``return_d`` (``None`` when it was not supplied).
    """
    # Only *direct* children are listed here because the function recurses itself.
    # The fallback therefore uses named_children(); the recursive named_sublayers()
    # would make every nested layer be visited once per ancestor.
    if hasattr(module, "_sub_layers"):
        sub_items = list(module._sub_layers.items())
    elif hasattr(module, "named_children"):
        sub_items = list(module.named_children())
    else:
        sub_items = []

    for name, submodule in sub_items:
        if submodule is None:
            # Empty slot: nothing to match and nothing to descend into.
            continue

        full_name = f"{parent_name}.{name}" if parent_name else name

        # Fresh dict per child so the caller's kwargs are never mutated.
        local_kwargs = dict(kwargs, name=name, full_name=full_name, parent_module=module)

        if isinstance(submodule, class_type):
            result = function(submodule, **local_kwargs)
            if return_d is not None:
                return_d[full_name] = result

        # Descend into the child captured before `function` ran; the contextual keys
        # are rebuilt at each level, so only the caller's kwargs are passed down.
        apply_func_to_submodules(submodule, class_type, function, full_name, return_d, **kwargs)

    return return_d


class StraightThrough(nn.Layer):
    """Identity layer; ``channel_num`` is accepted but unused."""

    def __init__(self, channel_num: int = 1):
        super().__init__()

    def forward(self, input):
        return input


def seed_everything(seed=42):
    """Seed Python, NumPy and Paddle random number generators.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)

    paddle.seed(seed)

    # `_manual_program_seed` is a private Paddle helper, so call it only when this
    # Paddle build provides it instead of failing on import-time API drift.
    manual_program_seed = getattr(paddle.framework.random, "_manual_program_seed", None)
    if manual_program_seed is not None:
        manual_program_seed(seed)

    device = paddle.get_device()
    if device.startswith("gpu"):
        print(f"[Info] Using GPU with fixed seed {seed}")
    else:
        print(f"[Info] Using {device} with fixed seed {seed}")


def setup_logging(log_file):
    """Configure the root logger to write DEBUG+ records to stdout and ``log_file``.

    Args:
        log_file: path of the log file, opened in append mode. Its parent
            directory is created when missing.
    """
    log_dir = os.path.dirname(log_file)
    if log_dir:
        # FileHandler does not create directories itself.
        os.makedirs(log_dir, exist_ok=True)

    logging_config = {
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'standard': {
                'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            },
        },
        'handlers': {
            'console': {
                'class': 'logging.StreamHandler',
                'level': 'DEBUG',
                'formatter': 'standard',
                'stream': 'ext://sys.stdout'
            },
            'file': {
                'class': 'logging.FileHandler',
                'level': 'DEBUG',
                'formatter': 'standard',
                'filename': log_file,
                'mode': 'a',
            }
        },
        'loggers': {
            '': {
                'handlers': ['console', 'file'],
                'level': 'DEBUG',
                'propagate': True
            }
        }
    }
    logging.config.dictConfig(logging_config)
环境配置 + +### 1.1 基础环境 + +安装示例: +```bash +# 创建虚拟环境 +conda create -n qdiff python=3.9 +conda activate qdiff + +# 安装 PaddlePaddle GPU 版本(以ppdiffusers官方文档为准) +pip install paddlepaddle-gpu==2.6.0.post117 -f https://www.paddlepaddle.org.cn/whl/mkl/stable.html + +# 安装 ppdiffusers 及相关库(以ppdiffusers官方文档为准) +pip install ppdiffusers + +# 安装 qdiff (本地版本) +cd ./quant_utils +pip install -e . +``` +## 🚀 2. 运行方法 + +整个流程分为 三个阶段。建议依次运行。 + +### 2.1 阶段一:生成校准数据(calib_data) + +该阶段用于提取模型中关键层的特征分布,用于后续量化参数调优。 +```python +CUDA_VISIBLE_DEVICES=$GPU_ID +python get_calib_data.py \ +--quant-config "./configs/${CFG}" \ +--log "./logs/${LOG}" \ +--prompt $PROMPT_PATH +``` + +输出: + +./logs/${LOG} 文件夹中保存特征 Tensor。 + +### 2.2 阶段二:确定量化参数(PTQ, Post-Training Quantization) + +该阶段根据校准数据进行量化参数优化、scale 校正。 +```python +CUDA_VISIBLE_DEVICES=$GPU_ID +python ptq.py \ +--quant-config "./configs/${CFG}" \ +--log "./logs/${LOG}" +``` + +输出: + +./logs/${LOG} 保存优化后的量化参数。 + +### 2.3 阶段三:量化推理(Inference) + +加载量化模型参数并进行推理测试。 +```python +CUDA_VISIBLE_DEVICES=$GPU_ID +python quant_inference.py \ +--quant-config "./configs/${CFG}" \ +--log "./logs/${LOG}" +``` + +输出: + +生成的图像保存在 ./logs/${LOG}/generated_images 目录。 + +以上可通过直接调用main.sh实现 +```bash +. example/pixart/main.sh +``` + +## 4.技术文章 +This repo's main methods come from our ICLR'25 paper: [ViDiT-Q: Efficient and Accurate Quantization of Diffusion Transformers for Image and Video Generation](https://arxiv.org/abs/2406.02540). \ No newline at end of file