[Single File] Add GGUF support #9964
Changes from 44 commits
**New file: GGUF quantization documentation**
@@ -0,0 +1,68 @@

<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# GGUF
The GGUF file format is typically used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and supports a variety of block-wise quantization options. Diffusers supports loading checkpoints that were prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Loading GGUF checkpoints via Pipelines is currently not supported.

The following example loads the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant.

Before starting, please install gguf in your environment:

```shell
pip install -U gguf
```

Since GGUF is a single-file format, we use `from_single_file` to load the model and pass in the `GGUFQuantizationConfig`.

When using GGUF checkpoints, the quantized weights remain in a low-memory `dtype`, typically `torch.uint8`, and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantization are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF).
```python
import torch

from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

ckpt_path = (
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
)
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
prompt = "A cat holding a sign that says hello world"
image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
image.save("flux-gguf.png")
```
## Supported Quantization Types

- BF16
- Q4_0
- Q4_1
- Q5_0
- Q5_1
- Q8_0
- Q2_K
- Q3_K
- Q4_K
- Q5_K
- Q6_K
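To use one of the other supported quantization types, point `ckpt_path` at a GGUF file saved with that quantization. A minimal sketch, assuming the same repository publishes a Q8_0 variant under the filename shown (the exact filename is an assumption, not taken from this PR):

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

# Assumed filename; check the repository for the GGUF variants it actually provides.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q8_0.gguf"

transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```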
**Changes to the single-file model loader**
```diff
@@ -17,8 +17,10 @@
 from contextlib import nullcontext
 from typing import Optional
 
 import torch
 from huggingface_hub.utils import validate_hf_hub_args
 
+from ..quantizers import DiffusersAutoQuantizer
 from ..utils import deprecate, is_accelerate_available, logging
 from .single_file_utils import (
     SingleFileComponentError,
```
```diff
@@ -202,6 +204,8 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
         subfolder = kwargs.pop("subfolder", None)
         revision = kwargs.pop("revision", None)
         torch_dtype = kwargs.pop("torch_dtype", None)
+        quantization_config = kwargs.pop("quantization_config", None)
+        device = kwargs.pop("device", None)
 
         if isinstance(pretrained_model_link_or_path_or_dict, dict):
             checkpoint = pretrained_model_link_or_path_or_dict
```
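As a usage illustration (mine, not from the PR) of the two new keyword arguments: `quantization_config` selects the quantizer, and `device` controls where checkpoint parameters are placed while loading. A minimal sketch, assuming a CUDA device is available:

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

transformer = FluxTransformer2DModel.from_single_file(
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    # New kwarg: load parameters onto this device instead of the CPU default.
    device="cuda:0",
    torch_dtype=torch.bfloat16,
)
```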
```diff
@@ -215,6 +219,12 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 local_files_only=local_files_only,
                 revision=revision,
             )
 
+        if quantization_config is not None:
+            hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config)
+            hf_quantizer.validate_environment()
+
+        else:
+            hf_quantizer = None
+
         mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name]
```
Comment on lines +234 to +239
For GGUF files, I'm thinking it would be nice to allow the user to load the model without necessarily having to specify the `quantization_config`. Also, what happens when the user passes a GGUF file without specifying the `quantization_config`?

Yeah, this is a good point! I think for most users, the entrypoint for GGUF files is going to be through …
I agree that this is a nice convenience. GGUF does have all the information we need to auto-fetch the config (honestly, it's possible to skip the config altogether), but it would mean that loading semantics would be different for GGUF vs other quant types. E.g.

GGUF:

```python
model = FluxTransformer2DModel.from_single_file("<>.gguf")
```

BnB and TorchAO (assuming these can be supported):

```python
model = FluxTransformer2DModel.from_single_file("<path>", quantization_config=BnBConfig)
model = FluxTransformer2DModel.from_single_file("<path>", quantization_config=TorchAOConfig)
```

GGUF can also be used through …

@SunMarc if the config isn't passed, you get shape mismatch errors when you hit …
Yeah, I thought about that too, but I think the API for GGUF is a special case here because it has a built-in config. Normally, for single-file it is just a checkpoint without a config, so you will always have to pass a config (at least I think so, is it? @DN6). So for loading a regular quantized model (e.g. BnB) we can load it with …, so I agree with @DN6 here: I think it is more important to keep the same API (…). But if there is a way to make it consistent between `from_pretrained` and `from_single_file` and across all quant types, that would be great!

Also, I want to know this: do we plan to support quantizing a model in `from_single_file` as well?
Would it make sense to at least make the user aware when the passed config and the determined config mismatch, and whether that could lead to unintentional consequences?

Supporting quantizing in the GGUF format (regardless of …
@yiyixuxu Yeah, we can definitely support quantizing a model via single file. For GGUF I can look into it in a follow-up, because we would have to port the quantize functions to torch (the gguf library uses numpy). We could use the gguf library internally to quantize, but it's quite slow, since we would have to move tensors off the GPU, convert to numpy, and then quantize. With torchao I'm pretty sure it would work just out of the box. You would have to save it with …

So, what I am hearing is that saving a GGUF quantized model would be added in a follow-up PR? That is also okay, but it could be quite an enabling factor for the community. I think the porting option is preferable.

You mean serializing with …?
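For readers following the thread: as of this PR, the config does need to be passed explicitly. The sketch below (my illustration, not code from the PR) shows roughly what the loader does with a supplied config, using the classes referenced in the diff:

```python
import torch

from diffusers import GGUFQuantizationConfig
from diffusers.quantizers import DiffusersAutoQuantizer

# Build the config the user would pass to `from_single_file`.
config = GGUFQuantizationConfig(compute_dtype=torch.bfloat16)

# The loader resolves it to a concrete quantizer and checks dependencies up front.
quantizer = DiffusersAutoQuantizer.from_config(config)
quantizer.validate_environment()
```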
```diff
@@ -296,8 +306,36 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
         with ctx():
             model = cls.from_config(diffusers_model_config)
 
+        # Check if `_keep_in_fp32_modules` is not None
+        use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (
+            (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules")
+        )
+        if use_keep_in_fp32_modules:
+            keep_in_fp32_modules = cls._keep_in_fp32_modules
+            if not isinstance(keep_in_fp32_modules, list):
+                keep_in_fp32_modules = [keep_in_fp32_modules]
+
+        else:
+            keep_in_fp32_modules = []
+
+        if hf_quantizer is not None:
+            hf_quantizer.preprocess_model(
+                model=model,
+                device_map=None,
+                state_dict=diffusers_format_checkpoint,
+                keep_in_fp32_modules=keep_in_fp32_modules,
+            )
+
         if is_accelerate_available():
-            unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+            param_device = torch.device(device) if device else torch.device("cpu")
+            unexpected_keys = load_model_dict_into_meta(
+                model,
+                diffusers_format_checkpoint,
+                dtype=torch_dtype,
+                device=param_device,
+                hf_quantizer=hf_quantizer,
+                keep_in_fp32_modules=keep_in_fp32_modules,
+            )
 
         else:
             _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
```
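For context on `_keep_in_fp32_modules` used above (not part of the diff): it is a class-level attribute a model class can set so that numerically sensitive submodules are kept in float32 when the rest of the model is loaded in half precision or handed to a quantizer. A toy sketch with a hypothetical model:

```python
import torch.nn as nn

from diffusers import ModelMixin


class ToyModel(ModelMixin):
    # Hypothetical model: submodules whose names match entries in this list are
    # excluded from downcasting when loading with torch_dtype=torch.float16
    # (or when the active quantizer opts into the same behaviour).
    _keep_in_fp32_modules = ["norm_out"]

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)
        self.norm_out = nn.LayerNorm(8)
```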
```diff
@@ -311,7 +349,11 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
             )
 
-        if torch_dtype is not None:
+        if hf_quantizer is not None:
+            hf_quantizer.postprocess_model(model)
+            model.hf_quantizer = hf_quantizer
+
+        if torch_dtype is not None and hf_quantizer is None:
             model.to(torch_dtype)
 
         model.eval()
```
**Changes to the single-file loading utilities (`single_file_utils.py`)**
```diff
@@ -81,8 +81,14 @@
     "open_clip_sd3": "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight",
     "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
     "stable_cascade_stage_c": "clip_txt_mapper.weight",
-    "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
-    "sd35_large": "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight",
+    "sd3": [
+        "joint_blocks.0.context_block.adaLN_modulation.1.bias",
+        "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
+    ],
+    "sd35_large": [
+        "joint_blocks.37.x_block.mlp.fc1.weight",
+        "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight",
+    ],
     "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe",
     "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
     "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
```

Review discussion on the `"sd3"` entry:

Need to make this change because SD3/3.5 GGUF single-file checkpoints use different keys than the original model from SAI.

Anything special for Flux?
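A toy illustration of the comment above (the key names come from the diff; the miniature state dict is made up): GGUF exports of SD3/3.5 drop the `model.diffusion_model.` prefix, so detection has to accept either spelling.

```python
import torch

SD3_KEYS = [
    "joint_blocks.0.context_block.adaLN_modulation.1.bias",
    "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
]

# A made-up, GGUF-style state dict that only contains the unprefixed key.
gguf_style_checkpoint = {SD3_KEYS[0]: torch.zeros(9216)}

# The old check looked only for the prefixed key, so GGUF checkpoints were missed.
assert SD3_KEYS[1] not in gguf_style_checkpoint

# The new check accepts any known spelling of the key.
assert any(key in gguf_style_checkpoint for key in SD3_KEYS)
```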
```diff
@@ -529,13 +535,20 @@ def infer_diffusers_model_type(checkpoint):
     ):
         model_type = "stable_cascade_stage_b"
 
-    elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216:
-        if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864:
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd3"]) and any(
+        checkpoint[key].shape[-1] == 9216 if key in checkpoint else False for key in CHECKPOINT_KEY_NAMES["sd3"]
+    ):
+        if "model.diffusion_model.pos_embed" in checkpoint:
+            key = "model.diffusion_model.pos_embed"
+        else:
+            key = "pos_embed"
+
+        if checkpoint[key].shape[1] == 36864:
             model_type = "sd3"
-        elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456:
+        elif checkpoint[key].shape[1] == 147456:
             model_type = "sd35_medium"
 
-    elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint:
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["sd35_large"]):
         model_type = "sd35_large"
 
     elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
```