Add remote_decode to remote_utils

hlky · hlky · commit 47498fbd6fb0 · 2025-02-25T06:44:17.000Z
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
@@ -116,6 +116,7 @@
     unscale_lora_layers,
 )
 from .pil_utils import PIL_INTERPOLATION, make_image_grid, numpy_to_pil, pt_to_pil
+from .remote_utils import remote_decode
 from .state_dict_utils import (
     convert_all_state_dict_to_peft,
     convert_state_dict_to_diffusers,
diff --git a/src/diffusers/utils/remote_utils.py b/src/diffusers/utils/remote_utils.py
@@ -0,0 +1,161 @@
+import base64
+import io
+import json
+from typing import List, Literal, Optional, Union, cast
+
+import requests
+import torch
+from PIL import Image
+from safetensors.torch import _tobytes
+
+from ..image_processor import VaeImageProcessor
+from ..video_processor import VideoProcessor
+
+
+# Maps the dtype names reported by the remote endpoint to the matching torch dtypes.
+DTYPE_MAP = {
+    "float16": torch.float16,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+    "uint8": torch.uint8,
+}
+
+
+def remote_decode(
+    endpoint: str,
+    tensor: torch.Tensor,
+    processor: Optional[Union[VaeImageProcessor, VideoProcessor]] = None,
+    do_scaling: bool = True,
+    output_type: Literal["mp4", "pil", "pt"] = "pil",
+    image_format: Literal["png", "jpg"] = "jpg",
+    partial_postprocess: bool = False,
+    input_tensor_type: Literal["base64", "binary"] = "base64",
+    output_tensor_type: Literal["base64", "binary"] = "base64",
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+) -> Union[Image.Image, List[Image.Image], bytes, torch.Tensor]:
+    """
+    Send `tensor` to a remote decode endpoint and turn the response into images, video bytes or a tensor.
+
+    Args:
+        endpoint (`str`):
+            URL the tensor is POSTed to.
+        tensor (`torch.Tensor`):
+            Tensor to decode. Its raw bytes form the request body; its shape and dtype are sent as query
+            parameters.
+        processor (`VaeImageProcessor` or `VideoProcessor`, *optional*):
+            Post-processes the returned tensor when `output_type="pt"` and `partial_postprocess=False`. With
+            `output_type="pil"` its presence only switches the response to a tensor, which is converted to PIL
+            images here without calling the processor.
+        do_scaling (`bool`, defaults to `True`):
+            Forwarded to the endpoint as the `do_scaling` query parameter.
+        output_type (`"mp4"`, `"pil"` or `"pt"`, defaults to `"pil"`):
+            Response kind requested from the endpoint.
+        image_format (`"png"` or `"jpg"`, defaults to `"jpg"`):
+            Encoded image format requested when `output_type="pil"` and no `processor` is given.
+        partial_postprocess (`bool`, defaults to `False`):
+            Forwarded to the endpoint. With `output_type="pt"`, the returned tensor is converted to PIL images
+            directly instead of going through `processor`.
+        input_tensor_type (`"base64"` or `"binary"`, defaults to `"base64"`):
+            How the tensor bytes are sent: base64 inside a JSON body, or as the raw request body.
+        output_tensor_type (`"base64"` or `"binary"`, defaults to `"base64"`):
+            How a tensor response is read back: base64 inside a JSON body, or raw bytes with `shape` and
+            `dtype` response headers.
+        height (`int`, *optional*):
+            Sent as a query parameter together with `width`. Required when `tensor` has 3 dimensions.
+        width (`int`, *optional*):
+            Sent as a query parameter together with `height`. Required when `tensor` has 3 dimensions.
+
+    Returns:
+        `PIL.Image.Image`, `List[PIL.Image.Image]`, `bytes` or `torch.Tensor`:
+            The raw response body for `"mp4"`; one or more PIL images for `"pil"`; for `"pt"`, the images built
+            from the returned tensor (or the tensor itself when there is no `processor` to apply).
+
+    Raises:
+        ValueError: If an argument has an unsupported value or a required argument is missing.
+        RuntimeError: If the endpoint answers with an error status.
+    """
+    if tensor.ndim == 3 and (height is None or width is None):
+        raise ValueError("`height` and `width` required for packed latents.")
+    if output_type == "pt" and partial_postprocess is False and processor is None:
+        raise ValueError("`processor` is required with `output_type='pt'` and `partial_postprocess=False`.")
+    if output_type not in ("mp4", "pil", "pt"):
+        raise ValueError(f"`output_type` must be 'mp4', 'pil' or 'pt', got {output_type!r}.")
+    if input_tensor_type not in ("base64", "binary"):
+        raise ValueError(f"`input_tensor_type` must be 'base64' or 'binary', got {input_tensor_type!r}.")
+
+    # The endpoint answers with a tensor for "pt", and for "pil" whenever a processor is supplied.
+    returns_tensor = output_type == "pt" or (output_type == "pil" and processor is not None)
+    if returns_tensor and output_tensor_type not in ("base64", "binary"):
+        raise ValueError(f"`output_tensor_type` must be 'base64' or 'binary', got {output_tensor_type!r}.")
+    if output_type == "pil" and not returns_tensor and image_format not in ("png", "jpg"):
+        raise ValueError(f"`image_format` must be 'png' or 'jpg', got {image_format!r}.")
+
+    parameters = {
+        "do_scaling": do_scaling,
+        "output_type": output_type,
+        "partial_postprocess": partial_postprocess,
+        "shape": list(tensor.shape),
+        "dtype": str(tensor.dtype).split(".")[-1],
+    }
+    # Height and width are only forwarded as a pair.
+    if height is not None and width is not None:
+        parameters["height"] = height
+        parameters["width"] = width
+
+    headers = {"Content-Type": f"tensor/{input_tensor_type}"}
+    if returns_tensor:
+        headers["Accept"] = f"tensor/{output_tensor_type}"
+    elif output_type == "pil":
+        headers["Accept"] = "image/jpeg" if image_format == "jpg" else "image/png"
+    else:
+        headers["Accept"] = "text/plain"
+
+    tensor_data = _tobytes(tensor, "tensor")
+    if input_tensor_type == "base64":
+        kwargs = {"json": {"inputs": base64.b64encode(tensor_data).decode("utf-8")}}
+    else:
+        kwargs = {"data": tensor_data}
+
+    # NOTE(review): no `timeout` is passed, so a stalled endpoint blocks this call indefinitely.
+    response = requests.post(endpoint, params=parameters, headers=headers, **kwargs)
+    if not response.ok:
+        # Error bodies are not guaranteed to be JSON; fall back to the raw text so the failure stays visible.
+        try:
+            detail = response.json()
+        except ValueError:
+            detail = response.text
+        raise RuntimeError(detail)
+
+    if output_type == "mp4":
+        return response.content
+    if not returns_tensor:
+        # `output_type="pil"` without a processor: the body is an encoded image.
+        return Image.open(io.BytesIO(response.content)).convert("RGB")
+
+    if output_tensor_type == "base64":
+        content = response.json()
+        raw = base64.b64decode(content["inputs"])
+        shape = content["parameters"]["shape"]
+        dtype = content["parameters"]["dtype"]
+    else:
+        raw = response.content
+        shape = json.loads(response.headers["shape"])
+        dtype = response.headers["dtype"]
+    # Copy into a `bytearray` so `torch.frombuffer` gets a writable buffer.
+    output_tensor = torch.frombuffer(bytearray(raw), dtype=DTYPE_MAP[dtype]).reshape(shape)
+
+    if output_type == "pil":
+        # presumably a (batch, channels, height, width) tensor with values in [0, 1] - TODO confirm
+        frames = (output_tensor.permute(0, 2, 3, 1).float().numpy() * 255).round().astype("uint8")
+        return [Image.fromarray(frame) for frame in frames]
+    if partial_postprocess:
+        # presumably the endpoint already returned image-ready arrays - TODO confirm
+        images = [Image.fromarray(image.numpy()) for image in output_tensor]
+        return images[0] if len(images) == 1 else images
+    if processor is None:
+        # Only reachable when `partial_postprocess` is falsy without being `False` (the check above uses `is`).
+        return output_tensor
+    if isinstance(processor, VideoProcessor):
+        return cast(List[Image.Image], processor.postprocess_video(output_tensor, output_type="pil")[0])
+    return cast(Image.Image, processor.postprocess(output_tensor, output_type="pil")[0])

Original file line number	Diff line number	Diff line change
`@@ -116,6 +116,7 @@`
`116`	`116`	`unscale_lora_layers,`
`117`	`117`	`)`
`118`	`118`	`from .pil_utils import PIL_INTERPOLATION, make_image_grid, numpy_to_pil, pt_to_pil`
	`119`	`+from .remote_utils import remote_decode`
`119`	`120`	`from .state_dict_utils import (`
`120`	`121`	`convert_all_state_dict_to_peft,`
`121`	`122`	`convert_state_dict_to_diffusers,`