Add remote_decode to remote_utils (#10898)
**New file (47 lines): Hybrid Inference documentation**
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Hybrid Inference

**Empowering local AI builders with Hybrid Inference**

> [!TIP]
> [Hybrid Inference](https://huggingface.co/blog/remote_vae) is an experimental feature.

---
## Why use Hybrid Inference?

> **Review note:** So here we basically list the types of endpoints we offer; I think we can create a section for that later.
Hybrid Inference offers a fast and simple way to offload local generation requirements.

* **VAE Decode:** Quickly decode latents to images without compromising quality or slowing down your workflow (see the sketch below).
* **VAE Encode (coming soon):** Encode images to latents for generation or training.
* **Text Encoders (coming soon):** Compute text embeddings for prompts without compromising quality or slowing down your workflow.
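As an illustration, here is a minimal sketch of a remote VAE decode with the `remote_decode` helper this PR adds (the endpoint URL is a placeholder, the latent is a stand-in for real pipeline output, and the SD v1 `scaling_factor` comes from the helper's docstring):

```python
import torch

from diffusers.utils.remote_utils import remote_decode

latent = torch.randn([1, 4, 64, 64], dtype=torch.float16)  # stand-in for pipeline latents
image = remote_decode(
    endpoint="https://<vae-decode-endpoint>/",  # placeholder, not a real endpoint
    tensor=latent,
    scaling_factor=0.18215,  # SD v1
)
image.save("decoded.jpg")  # defaults return a PIL image decoded from JPEG bytes
```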
Feedback can be provided [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).

---

## Key Benefits

- 🚀 **Reduced Requirements:** Access powerful models without expensive hardware.
- 🎯 **Diverse Use Cases:** Fully compatible with Diffusers 🧨 and the wider community.
- 🔧 **Developer-Friendly:** Simple requests, fast responses.

---
> **Review note:** Let's add a section to showcase real-world use cases! We can link to Comfy nodes, etc.
## Contents

The documentation is organized into two sections:

* **Getting Started:** Learn the basics of how to use Hybrid Inference.
* **API Reference:** Dive into task-specific settings and parameters.
**New file (334 lines): `remote_utils.py`**
```python
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import json
from typing import List, Literal, Optional, Union, cast

import requests

from .deprecation_utils import deprecate
from .import_utils import is_safetensors_available, is_torch_available


if is_torch_available():
    import torch

    from ..image_processor import VaeImageProcessor
    from ..video_processor import VideoProcessor

if is_safetensors_available():
    import safetensors.torch

# Maps the `dtype` string carried in request/response headers to a torch dtype.
# Guarded so that importing this module does not require torch.
if is_torch_available():
    DTYPE_MAP = {
        "float16": torch.float16,
        "float32": torch.float32,
        "bfloat16": torch.bfloat16,
        "uint8": torch.uint8,
    }


from PIL import Image


def detect_image_type(data: bytes) -> str:
    """Sniff the image format from the magic bytes at the start of `data`."""
    if data.startswith(b"\xff\xd8"):
        return "jpeg"
    elif data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    elif data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
        return "gif"
    elif data.startswith(b"BM"):
        return "bmp"
    return "unknown"
```
```python
def check_inputs(
    endpoint: str,
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    input_tensor_type: Literal["binary"] = "binary",
    output_tensor_type: Literal["binary"] = "binary",
    height: Optional[int] = None,
    width: Optional[int] = None,
):
    if tensor.ndim == 3 and height is None and width is None:
        raise ValueError("`height` and `width` required for packed latents.")
    if (
        output_type == "pt"
        and return_type == "pil"
        and not partial_postprocess
        and not isinstance(processor, (VaeImageProcessor, VideoProcessor))
    ):
        raise ValueError("`processor` is required.")
    if do_scaling and scaling_factor is None:
        deprecate(
            "do_scaling",
            "1.0.0",
            "`do_scaling` is deprecated, pass `scaling_factor` and `shift_factor` if required.",
            standard_warn=False,
        )
```
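For illustration (not part of the file), the two `ValueError` paths in `check_inputs` can be exercised with hypothetical tensors, assuming torch is available:

```python
import torch

# 3-dim tensors are treated as packed latents and need height/width.
try:
    check_inputs("https://example/", torch.randn([1, 4096, 64]))
except ValueError as e:
    print(e)  # `height` and `width` required for packed latents.

# Converting a raw "pt" output to PIL without partial postprocessing needs a processor.
try:
    check_inputs("https://example/", torch.randn([1, 4, 64, 64]), output_type="pt", return_type="pil")
except ValueError as e:
    print(e)  # `processor` is required.
```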
```python
def postprocess(
    response: requests.Response,
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    partial_postprocess: bool = False,
):
    if output_type == "pt" or (output_type == "pil" and processor is not None):
        # The endpoint returned raw tensor bytes; shape/dtype travel in the headers.
        output_tensor = response.content
        parameters = response.headers
        shape = json.loads(parameters["shape"])
        dtype = parameters["dtype"]
        torch_dtype = DTYPE_MAP[dtype]
        output_tensor = torch.frombuffer(bytearray(output_tensor), dtype=torch_dtype).reshape(shape)
    if output_type == "pt":
        if partial_postprocess:
            if return_type == "pil":
                output = [Image.fromarray(image.numpy()) for image in output_tensor]
                if len(output) == 1:
                    output = output[0]
            elif return_type == "pt":
                output = output_tensor
        else:
            if processor is None or return_type == "pt":
                output = output_tensor
            else:
                if isinstance(processor, VideoProcessor):
                    output = cast(
                        List[Image.Image],
                        processor.postprocess_video(output_tensor, output_type="pil")[0],
                    )
                else:
                    output = cast(
                        Image.Image,
                        processor.postprocess(output_tensor, output_type="pil")[0],
                    )
    elif output_type == "pil" and return_type == "pil" and processor is None:
        output = Image.open(io.BytesIO(response.content)).convert("RGB")
        detected_format = detect_image_type(response.content)
        output.format = detected_format
    elif output_type == "pil" and processor is not None:
        if return_type == "pil":
            output = [
                Image.fromarray(image)
                for image in (output_tensor.permute(0, 2, 3, 1).float().numpy() * 255).round().astype("uint8")
            ]
        elif return_type == "pt":
            output = output_tensor
    elif output_type == "mp4" and return_type == "mp4":
        output = response.content
    return output


def prepare(
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    height: Optional[int] = None,
    width: Optional[int] = None,
):
    headers = {}
    parameters = {
        "image_format": image_format,
        "output_type": output_type,
        "partial_postprocess": partial_postprocess,
        "shape": list(tensor.shape),
        "dtype": str(tensor.dtype).split(".")[-1],
    }
    if do_scaling and scaling_factor is not None:
        parameters["scaling_factor"] = scaling_factor
    if do_scaling and shift_factor is not None:
        parameters["shift_factor"] = shift_factor
    # Deprecated path: forward `do_scaling` so the endpoint applies its default scaling.
    if do_scaling and scaling_factor is None:
        parameters["do_scaling"] = do_scaling
    if height is not None and width is not None:
        parameters["height"] = height
        parameters["width"] = width
    headers["Content-Type"] = "tensor/binary"
    headers["Accept"] = "tensor/binary"
    if output_type == "pil" and image_format == "jpg" and processor is None:
        headers["Accept"] = "image/jpeg"
    elif output_type == "pil" and image_format == "png" and processor is None:
        headers["Accept"] = "image/png"
    elif output_type == "mp4":
        headers["Accept"] = "text/plain"
    tensor_data = safetensors.torch._tobytes(tensor, "tensor")
    return {"data": tensor_data, "params": parameters, "headers": headers}
```
```python
def remote_decode(
    endpoint: str,
    tensor: "torch.Tensor",
    processor: Optional[Union["VaeImageProcessor", "VideoProcessor"]] = None,
    do_scaling: bool = True,
    scaling_factor: Optional[float] = None,
    shift_factor: Optional[float] = None,
    output_type: Literal["mp4", "pil", "pt"] = "pil",
    return_type: Literal["mp4", "pil", "pt"] = "pil",
    image_format: Literal["png", "jpg"] = "jpg",
    partial_postprocess: bool = False,
    input_tensor_type: Literal["binary"] = "binary",
    output_tensor_type: Literal["binary"] = "binary",
    height: Optional[int] = None,
    width: Optional[int] = None,
) -> Union[Image.Image, List[Image.Image], bytes, "torch.Tensor"]:
    """
    Hugging Face Hybrid Inference that allows running VAE decode remotely.

    Args:
        endpoint (`str`):
            Endpoint for Remote Decode.
        tensor (`torch.Tensor`):
            Tensor to be decoded.
        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
            Used with `output_type="pt"`, and `return_type="pil"` for video models.
        do_scaling (`bool`, default `True`, *optional*):
            **DEPRECATED**: pass `scaling_factor`/`shift_factor` instead. Until the option is removed, set
            `do_scaling=None`/`do_scaling=False` to disable scaling. When `True`, scaling (e.g. `latents /
            self.vae.config.scaling_factor`) is applied remotely. If `False`, input must be passed with scaling
            already applied.
        scaling_factor (`float`, *optional*):
            Scaling is applied when passed, e.g. [`latents /
            self.vae.config.scaling_factor`](https://github.com/huggingface/diffusers/blob/7007febae5cff000d4df9059d9cf35133e8b2ca9/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L1083C37-L1083C77).
            - SD v1: 0.18215
            - SD XL: 0.13025
            - Flux: 0.3611
            If `None`, input must be passed with scaling applied.
        shift_factor (`float`, *optional*):
            Shift is applied when passed, e.g. `latents + self.vae.config.shift_factor`.
            - Flux: 0.1159
            If `None`, input must be passed with shift applied.
        output_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
            **Endpoint** output type. Subject to change. Report feedback on preferred type.

            `"mp4"`: Supported by video models. Endpoint returns `bytes` of video.
            `"pil"`: Supported by image and video models.
                Image models: endpoint returns `bytes` of an image in `image_format`.
                Video models: endpoint returns a `torch.Tensor` with partial postprocessing applied;
                    requires `processor` as a flag (any non-`None` value will work).
            `"pt"`: Supported by image and video models. Endpoint returns `torch.Tensor`.
                With `partial_postprocess=True` the tensor is a postprocessed `uint8` image tensor.

            Recommendations:
                `"pt"` with `partial_postprocess=True` is the smallest transfer for full quality.
                `"pt"` with `partial_postprocess=False` is the most compatible with third-party code.
                `"pil"` with `image_format="jpg"` is the smallest transfer overall.

        return_type (`"mp4"` or `"pil"` or `"pt"`, default `"pil"`):
            **Function** return type.

            `"mp4"`: Function returns `bytes` of video.
            `"pil"`: Function returns `PIL.Image.Image`.
                With `output_type="pil"` no further processing is applied.
                With `output_type="pt"` a `PIL.Image.Image` is created:
                    `partial_postprocess=False` requires `processor`;
                    `partial_postprocess=True` does **not** require `processor`.
            `"pt"`: Function returns `torch.Tensor`; `processor` is **not** required.
                `partial_postprocess=False`: tensor is `float16` or `bfloat16`, without denormalization.
                `partial_postprocess=True`: tensor is `uint8`, denormalized.

        image_format (`"png"` or `"jpg"`, default `"jpg"`):
            Used with `output_type="pil"`. Endpoint returns `jpg` or `png`.

        partial_postprocess (`bool`, default `False`):
            Used with `output_type="pt"`. `partial_postprocess=False`: tensor is `float16` or `bfloat16`, without
            denormalization. `partial_postprocess=True`: tensor is `uint8`, denormalized.

        input_tensor_type (`"binary"`, default `"binary"`):
            Tensor transfer type.

        output_tensor_type (`"binary"`, default `"binary"`):
            Tensor transfer type.

        height (`int`, *optional*):
            Required for packed latents.

        width (`int`, *optional*):
            Required for packed latents.

    Returns:
        output (`Image.Image` or `List[Image.Image]` or `bytes` or `torch.Tensor`).
    """
    if input_tensor_type == "base64":
        deprecate(
            "input_tensor_type='base64'",
            "1.0.0",
            "input_tensor_type='base64' is deprecated. Using `binary`.",
            standard_warn=False,
        )
        input_tensor_type = "binary"
    if output_tensor_type == "base64":
        deprecate(
            "output_tensor_type='base64'",
            "1.0.0",
            "output_tensor_type='base64' is deprecated. Using `binary`.",
            standard_warn=False,
        )
        output_tensor_type = "binary"
    check_inputs(
        endpoint,
        tensor,
        processor,
        do_scaling,
        scaling_factor,
        shift_factor,
        output_type,
        return_type,
        image_format,
        partial_postprocess,
        input_tensor_type,
        output_tensor_type,
        height,
        width,
    )
    kwargs = prepare(
        tensor=tensor,
        processor=processor,
        do_scaling=do_scaling,
        scaling_factor=scaling_factor,
        shift_factor=shift_factor,
        output_type=output_type,
        image_format=image_format,
        partial_postprocess=partial_postprocess,
        height=height,
        width=width,
    )
    response = requests.post(endpoint, **kwargs)
    if not response.ok:
        raise RuntimeError(response.json())
    output = postprocess(
        response=response,
        processor=processor,
        output_type=output_type,
        return_type=return_type,
        partial_postprocess=partial_postprocess,
    )
    return output
```
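To make the docstring's recommendations concrete, a hedged usage sketch (not part of the file): the endpoint URL, latent shapes, and the Flux packed-latent dimensions are illustrative assumptions, and the import path follows this PR's `remote_utils` module:

```python
import torch

from diffusers.utils.remote_utils import remote_decode

ENDPOINT = "https://<vae-decode-endpoint>/"  # placeholder
latent = torch.randn([1, 4, 64, 64], dtype=torch.float16)  # stand-in for SD v1 latents

# Smallest transfer at full quality: endpoint sends a denormalized uint8 tensor,
# which the function converts to a PIL image.
image = remote_decode(ENDPOINT, latent, scaling_factor=0.18215, output_type="pt", partial_postprocess=True)

# Most compatible with third-party code: raw float tensor, no processor needed.
decoded = remote_decode(ENDPOINT, latent, scaling_factor=0.18215, output_type="pt", return_type="pt")

# Smallest transfer overall: endpoint sends encoded JPEG bytes.
image = remote_decode(ENDPOINT, latent, scaling_factor=0.18215, output_type="pil", image_format="jpg")

# Packed (3-dim) latents, e.g. Flux, additionally require height/width;
# scaling/shift factors are the Flux values listed in the docstring.
flux_latent = torch.randn([1, 4096, 64], dtype=torch.bfloat16)
image = remote_decode(
    ENDPOINT,
    flux_latent,
    scaling_factor=0.3611,
    shift_factor=0.1159,
    height=1024,
    width=1024,
    output_type="pt",
    partial_postprocess=True,
)
```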