
Commit 3f69f92 ("comments")
1 parent 5302645

4 files changed: +126, -189 lines


docs/source/en/_toctree.yml

Lines changed: 4 additions & 4 deletions
@@ -112,10 +112,6 @@
   - local: using-diffusers/marigold_usage
     title: Marigold Computer Vision
   title: Specific pipeline examples
-- sections:
-  - local: hybrid_inference/overview
-    title: Overview
-  title: Hybrid Inference
 - sections:
   - local: training/overview
     title: Overview
@@ -632,3 +628,7 @@
       title: Video Processor
     title: Internal classes
   title: API
+- sections:
+  - local: hybrid_inference/overview
+    title: Overview
+  title: Hybrid Inference

docs/source/en/hybrid_inference/overview.md

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,9 @@ specific language governing permissions and limitations under the License.
 
 **Empowering local AI builders with Hybrid Inference**
 
+> [!TIP]
+> [Hybrid Inference](https://huggingface.co/blog/remote_vae) is an experimental feature.
+
 ---
 
 ## Why use Hybrid Inference?
@@ -24,6 +27,8 @@ Hybrid Inference offers a fast and simple way to offload local generation requirements
 * **VAE Encode (coming soon):** Encode images to latents for generation or training.
 * **Text Encoders (coming soon):** Compute text embeddings for prompts without compromising quality or slowing down your workflow.
 
+Feedback can be provided [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).
+
 ---
 
 ## Key Benefits
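For orientation, a minimal sketch of what the remote VAE Decode flow looks like from the caller's side, using the `remote_decode` helper touched in this commit. The endpoint URL and latent shape are placeholders, and the SD v1 scaling factor is the value listed in the `remote_decode` docstring further down:

```python
import torch

from diffusers.utils.remote_utils import remote_decode

# Placeholder latents; in practice these come from a locally run denoising loop.
latents = torch.randn(1, 4, 64, 64, dtype=torch.float16)

image = remote_decode(
    endpoint="https://<your-vae-decode-endpoint>/",  # hypothetical endpoint URL
    tensor=latents,
    scaling_factor=0.18215,  # SD v1 value from the remote_decode docstring
    output_type="pil",
    return_type="pil",
)
image.save("decoded.png")
```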

src/diffusers/utils/remote_utils.py

Lines changed: 14 additions & 4 deletions
@@ -13,8 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: `imghdr` is deprecated in Python 3.13 🙄
-import imghdr
 import io
 import json
 from typing import List, Literal, Optional, Union, cast
@@ -45,6 +43,18 @@
 from PIL import Image
 
 
+def detect_image_type(data: bytes) -> str:
+    if data.startswith(b"\xff\xd8"):
+        return "jpeg"
+    elif data.startswith(b"\x89PNG\r\n\x1a\n"):
+        return "png"
+    elif data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
+        return "gif"
+    elif data.startswith(b"BM"):
+        return "bmp"
+    return "unknown"
+
+
 def check_inputs(
     endpoint: str,
     tensor: "torch.Tensor",
@@ -117,7 +127,7 @@ def postprocess(
         )
     elif output_type == "pil" and return_type == "pil" and processor is None:
         output = Image.open(io.BytesIO(response.content)).convert("RGB")
-        detected_format = imghdr.what(None, h=response.content)
+        detected_format = detect_image_type(response.content)
         output.format = detected_format
     elif output_type == "pil" and processor is not None:
         if return_type == "pil":
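Since `detect_image_type` is a plain magic-byte check with no dependencies, it can be sanity-checked in isolation. A small sketch (the function body is copied from the hunk above so the snippet is self-contained):

```python
def detect_image_type(data: bytes) -> str:
    # Magic-byte sniffing; stands in for the deprecated `imghdr` module.
    if data.startswith(b"\xff\xd8"):
        return "jpeg"
    elif data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    elif data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
        return "gif"
    elif data.startswith(b"BM"):
        return "bmp"
    return "unknown"


assert detect_image_type(b"\xff\xd8\xff\xe0" + b"\x00" * 16) == "jpeg"
assert detect_image_type(b"\x89PNG\r\n\x1a\n" + b"\x00" * 16) == "png"
assert detect_image_type(b"GIF89a" + b"\x00" * 16) == "gif"
assert detect_image_type(b"BM" + b"\x00" * 16) == "bmp"
assert detect_image_type(b"plain text, not an image") == "unknown"
```

Unlike `imghdr.what`, which returns `None` for unrecognized data, the helper falls back to the string `"unknown"`, so `output.format` is always set to a string in the `postprocess` branch above.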
@@ -207,7 +217,7 @@ def remote_decode(
             / self.vae.config.scaling_factor` is applied remotely. If `False`, input must be passed with scaling
             applied.
         scaling_factor (`float`, *optional*):
-            Scaling is applied when passed e.g. `latents / self.vae.config.scaling_factor`.
+            Scaling is applied when passed e.g. [`latents / self.vae.config.scaling_factor`](https://github.com/huggingface/diffusers/blob/7007febae5cff000d4df9059d9cf35133e8b2ca9/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L1083C37-L1083C77).
             - SD v1: 0.18215
             - SD XL: 0.13025
             - Flux: 0.3611
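The docstring edit above only adds a link, but it documents behavior worth keeping in mind: when `scaling_factor` (and, for models that use one, `shift_factor`) is passed, the latent un-scaling happens remotely; otherwise the caller is expected to send latents with scaling already applied. A hedged sketch of what that un-scaling amounts to, using the factor values listed in the docstring (the Flux line mirrors how the diffusers pipelines un-scale latents before VAE decoding):

```python
import torch

latents_sd = torch.randn(1, 4, 128, 128)     # placeholder SD-style latents
latents_flux = torch.randn(1, 16, 128, 128)  # placeholder Flux-style latents

# SD v1 / SD XL style: a single scaling factor.
unscaled_sd = latents_sd / 0.18215

# Flux style: scaling factor plus shift factor.
unscaled_flux = latents_flux / 0.3611 + 0.1159
```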

tests/remote/test_remote_decode.py

Lines changed: 103 additions & 181 deletions
@@ -200,37 +200,7 @@ def test_output_tensor_type_base64_deprecation(self):
         )
 
 
-class RemoteAutoencoderKLHunyuanVideoMixin:
-    shape: Tuple[int, ...] = None
-    out_hw: Tuple[int, int] = None
-    endpoint: str = None
-    dtype: torch.dtype = None
-    scaling_factor: float = None
-    shift_factor: float = None
-    processor_cls: Union[VaeImageProcessor, VideoProcessor] = None
-    output_pil_slice: torch.Tensor = None
-    output_pt_slice: torch.Tensor = None
-    partial_postprocess_return_pt_slice: torch.Tensor = None
-    return_pt_slice: torch.Tensor = None
-    width: int = None
-    height: int = None
-
-    def get_dummy_inputs(self):
-        inputs = {
-            "endpoint": self.endpoint,
-            "tensor": torch.randn(
-                self.shape,
-                device=torch_device,
-                dtype=self.dtype,
-                generator=torch.Generator(torch_device).manual_seed(13),
-            ),
-            "scaling_factor": self.scaling_factor,
-            "shift_factor": self.shift_factor,
-            "height": self.height,
-            "width": self.width,
-        }
-        return inputs
-
+class RemoteAutoencoderKLHunyuanVideoMixin(RemoteAutoencoderKLMixin):
     def test_no_scaling(self):
         inputs = self.get_dummy_inputs()
         if inputs["scaling_factor"] is not None:
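The net effect of this hunk: `RemoteAutoencoderKLHunyuanVideoMixin` no longer re-declares the configuration attributes and `get_dummy_inputs` that `RemoteAutoencoderKLMixin` already provides; it inherits them and keeps only the video-specific tests. A trimmed sketch of the resulting structure (class names and values are taken from this file; bodies are elided or simplified for illustration):

```python
import unittest

import torch


class RemoteAutoencoderKLMixin:
    # Shared configuration, overridden by each concrete test class.
    shape = None
    endpoint = None
    dtype = None
    scaling_factor = None
    shift_factor = None
    height = None
    width = None

    def get_dummy_inputs(self):
        # Single source of truth for the request payload used by every test.
        return {
            "endpoint": self.endpoint,
            "tensor": torch.randn(self.shape, dtype=self.dtype),
            "scaling_factor": self.scaling_factor,
            "shift_factor": self.shift_factor,
            "height": self.height,
            "width": self.width,
        }


class RemoteAutoencoderKLHunyuanVideoMixin(RemoteAutoencoderKLMixin):
    # Only video-specific tests live here (e.g. test_output_type_mp4);
    # the shared decode and deprecation tests come from the parent mixin.
    pass


class RemoteAutoencoderKLHunyuanVideoTests(RemoteAutoencoderKLHunyuanVideoMixin, unittest.TestCase):
    shape = (1, 16, 3, 40, 64)
    dtype = torch.float16
    scaling_factor = 0.476986
    endpoint = "https://<hunyuan-video-vae-endpoint>/"  # placeholder
```

Concrete test classes keep supplying only their attributes, so the deduplication does not change what each test exercises.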
@@ -354,59 +324,11 @@ def test_output_type_pt_return_type_pt(self):
             f"{output_slice}",
         )
 
-    def test_output_type_pt_partial_postprocess_return_type_pt(self):
-        inputs = self.get_dummy_inputs()
-        output = remote_decode(output_type="pt", partial_postprocess=True, return_type="pt", **inputs)
-        self.assertTrue(isinstance(output, torch.Tensor), f"Expected `torch.Tensor` output, got {type(output)}")
-        self.assertEqual(
-            output.shape[1], self.out_hw[0], f"Expected image height {self.out_hw[0]}, got {output.shape[1]}"
-        )
-        self.assertEqual(
-            output.shape[2], self.out_hw[1], f"Expected image width {self.out_hw[0]}, got {output.shape[2]}"
-        )
-        output_slice = output[0, -3:, -3:, 0].flatten().cpu()
-        self.assertTrue(
-            torch_all_close(output_slice, self.partial_postprocess_return_pt_slice.to(output_slice.dtype), rtol=1e-2),
-            f"{output_slice}",
-        )
-
     def test_output_type_mp4(self):
         inputs = self.get_dummy_inputs()
         output = remote_decode(output_type="mp4", return_type="mp4", **inputs)
         self.assertTrue(isinstance(output, bytes), f"Expected `bytes` output, got {type(output)}")
 
-    def test_do_scaling_deprecation(self):
-        inputs = self.get_dummy_inputs()
-        inputs.pop("scaling_factor", None)
-        inputs.pop("shift_factor", None)
-        with self.assertWarns(FutureWarning) as warning:
-            _ = remote_decode(output_type="pt", partial_postprocess=True, **inputs)
-        self.assertEqual(
-            str(warning.warnings[0].message),
-            "`do_scaling` is deprecated, pass `scaling_factor` and `shift_factor` if required.",
-            str(warning.warnings[0].message),
-        )
-
-    def test_input_tensor_type_base64_deprecation(self):
-        inputs = self.get_dummy_inputs()
-        with self.assertWarns(FutureWarning) as warning:
-            _ = remote_decode(output_type="pt", input_tensor_type="base64", partial_postprocess=True, **inputs)
-        self.assertEqual(
-            str(warning.warnings[0].message),
-            "input_tensor_type='base64' is deprecated. Using `binary`.",
-            str(warning.warnings[0].message),
-        )
-
-    def test_output_tensor_type_base64_deprecation(self):
-        inputs = self.get_dummy_inputs()
-        with self.assertWarns(FutureWarning) as warning:
-            _ = remote_decode(output_type="pt", output_tensor_type="base64", partial_postprocess=True, **inputs)
-        self.assertEqual(
-            str(warning.warnings[0].message),
-            "output_tensor_type='base64' is deprecated. Using `binary`.",
-            str(warning.warnings[0].message),
-        )
-
 
 class RemoteAutoencoderKLSDv1Tests(
     RemoteAutoencoderKLMixin,
@@ -432,105 +354,105 @@ class RemoteAutoencoderKLSDv1Tests(
     return_pt_slice = torch.tensor([-0.2177, 0.0217, -0.2258, 0.0412, -0.1687, -0.1232, -0.2416, -0.2130, -0.0543])
 
 
-class RemoteAutoencoderKLSDXLTests(
-    RemoteAutoencoderKLMixin,
-    unittest.TestCase,
-):
-    shape = (
-        1,
-        4,
-        128,
-        128,
-    )
-    out_hw = (
-        1024,
-        1024,
-    )
-    endpoint = "https://fagf07t3bwf0615i.us-east-1.aws.endpoints.huggingface.cloud/"
-    dtype = torch.float16
-    scaling_factor = 0.13025
-    shift_factor = None
-    processor_cls = VaeImageProcessor
-    output_pt_slice = torch.tensor([104, 52, 23, 114, 61, 35, 108, 87, 38], dtype=torch.uint8)
-    partial_postprocess_return_pt_slice = torch.tensor([77, 86, 89, 49, 60, 75, 52, 65, 78], dtype=torch.uint8)
-    return_pt_slice = torch.tensor([-0.3945, -0.3289, -0.2993, -0.6177, -0.5259, -0.4119, -0.5898, -0.4863, -0.3845])
-
-
-class RemoteAutoencoderKLFluxTests(
-    RemoteAutoencoderKLMixin,
-    unittest.TestCase,
-):
-    shape = (
-        1,
-        16,
-        128,
-        128,
-    )
-    out_hw = (
-        1024,
-        1024,
-    )
-    endpoint = "https://fnohtuwsskxgxsnn.us-east-1.aws.endpoints.huggingface.cloud/"
-    dtype = torch.bfloat16
-    scaling_factor = 0.3611
-    shift_factor = 0.1159
-    processor_cls = VaeImageProcessor
-    output_pt_slice = torch.tensor([110, 72, 91, 62, 35, 52, 69, 55, 69], dtype=torch.uint8)
-    partial_postprocess_return_pt_slice = torch.tensor(
-        [202, 203, 203, 197, 195, 193, 189, 188, 178], dtype=torch.uint8
-    )
-    return_pt_slice = torch.tensor([0.5820, 0.5962, 0.5898, 0.5439, 0.5327, 0.5112, 0.4797, 0.4773, 0.3984])
-
-
-class RemoteAutoencoderKLFluxPackedTests(
-    RemoteAutoencoderKLMixin,
-    unittest.TestCase,
-):
-    shape = (
-        1,
-        4096,
-        64,
-    )
-    out_hw = (
-        1024,
-        1024,
-    )
-    height = 1024
-    width = 1024
-    endpoint = "https://fnohtuwsskxgxsnn.us-east-1.aws.endpoints.huggingface.cloud/"
-    dtype = torch.bfloat16
-    scaling_factor = 0.3611
-    shift_factor = 0.1159
-    processor_cls = VaeImageProcessor
-    # slices are different due to randn on different shape. we can pack the latent instead if we want the same
-    output_pt_slice = torch.tensor([96, 116, 157, 45, 67, 104, 34, 56, 89], dtype=torch.uint8)
-    partial_postprocess_return_pt_slice = torch.tensor(
-        [168, 212, 202, 155, 191, 185, 150, 180, 168], dtype=torch.uint8
-    )
-    return_pt_slice = torch.tensor([0.3198, 0.6631, 0.5864, 0.2131, 0.4944, 0.4482, 0.1776, 0.4153, 0.3176])
-
-
-class RemoteAutoencoderKLHunyuanVideoTests(
-    RemoteAutoencoderKLHunyuanVideoMixin,
-    unittest.TestCase,
-):
-    shape = (
-        1,
-        16,
-        3,
-        40,
-        64,
-    )
-    out_hw = (
-        320,
-        512,
-    )
-    endpoint = "https://lsx2injm3ts8wbvv.us-east-1.aws.endpoints.huggingface.cloud/"
-    dtype = torch.float16
-    scaling_factor = 0.476986
-    processor_cls = VideoProcessor
-    output_pt_slice = torch.tensor([112, 92, 85, 112, 93, 85, 112, 94, 85], dtype=torch.uint8)
-    partial_postprocess_return_pt_slice = torch.tensor(
-        [149, 161, 168, 136, 150, 156, 129, 143, 149], dtype=torch.uint8
-    )
-    return_pt_slice = torch.tensor([0.1656, 0.2661, 0.3157, 0.0693, 0.1755, 0.2252, 0.0127, 0.1221, 0.1708])
+# class RemoteAutoencoderKLSDXLTests(
+#     RemoteAutoencoderKLMixin,
+#     unittest.TestCase,
+# ):
+#     shape = (
+#         1,
+#         4,
+#         128,
+#         128,
+#     )
+#     out_hw = (
+#         1024,
+#         1024,
+#     )
+#     endpoint = "https://fagf07t3bwf0615i.us-east-1.aws.endpoints.huggingface.cloud/"
+#     dtype = torch.float16
+#     scaling_factor = 0.13025
+#     shift_factor = None
+#     processor_cls = VaeImageProcessor
+#     output_pt_slice = torch.tensor([104, 52, 23, 114, 61, 35, 108, 87, 38], dtype=torch.uint8)
+#     partial_postprocess_return_pt_slice = torch.tensor([77, 86, 89, 49, 60, 75, 52, 65, 78], dtype=torch.uint8)
+#     return_pt_slice = torch.tensor([-0.3945, -0.3289, -0.2993, -0.6177, -0.5259, -0.4119, -0.5898, -0.4863, -0.3845])
+
+
+# class RemoteAutoencoderKLFluxTests(
+#     RemoteAutoencoderKLMixin,
+#     unittest.TestCase,
+# ):
+#     shape = (
+#         1,
+#         16,
+#         128,
+#         128,
+#     )
+#     out_hw = (
+#         1024,
+#         1024,
+#     )
+#     endpoint = "https://fnohtuwsskxgxsnn.us-east-1.aws.endpoints.huggingface.cloud/"
+#     dtype = torch.bfloat16
+#     scaling_factor = 0.3611
+#     shift_factor = 0.1159
+#     processor_cls = VaeImageProcessor
+#     output_pt_slice = torch.tensor([110, 72, 91, 62, 35, 52, 69, 55, 69], dtype=torch.uint8)
+#     partial_postprocess_return_pt_slice = torch.tensor(
+#         [202, 203, 203, 197, 195, 193, 189, 188, 178], dtype=torch.uint8
+#     )
+#     return_pt_slice = torch.tensor([0.5820, 0.5962, 0.5898, 0.5439, 0.5327, 0.5112, 0.4797, 0.4773, 0.3984])
+
+
+# class RemoteAutoencoderKLFluxPackedTests(
+#     RemoteAutoencoderKLMixin,
+#     unittest.TestCase,
+# ):
+#     shape = (
+#         1,
+#         4096,
+#         64,
+#     )
+#     out_hw = (
+#         1024,
+#         1024,
+#     )
+#     height = 1024
+#     width = 1024
+#     endpoint = "https://fnohtuwsskxgxsnn.us-east-1.aws.endpoints.huggingface.cloud/"
+#     dtype = torch.bfloat16
+#     scaling_factor = 0.3611
+#     shift_factor = 0.1159
+#     processor_cls = VaeImageProcessor
+#     # slices are different due to randn on different shape. we can pack the latent instead if we want the same
+#     output_pt_slice = torch.tensor([96, 116, 157, 45, 67, 104, 34, 56, 89], dtype=torch.uint8)
+#     partial_postprocess_return_pt_slice = torch.tensor(
+#         [168, 212, 202, 155, 191, 185, 150, 180, 168], dtype=torch.uint8
+#     )
+#     return_pt_slice = torch.tensor([0.3198, 0.6631, 0.5864, 0.2131, 0.4944, 0.4482, 0.1776, 0.4153, 0.3176])
+
+
+# class RemoteAutoencoderKLHunyuanVideoTests(
+#     RemoteAutoencoderKLHunyuanVideoMixin,
+#     unittest.TestCase,
+# ):
+#     shape = (
+#         1,
+#         16,
+#         3,
+#         40,
+#         64,
+#     )
+#     out_hw = (
+#         320,
+#         512,
+#     )
+#     endpoint = "https://lsx2injm3ts8wbvv.us-east-1.aws.endpoints.huggingface.cloud/"
+#     dtype = torch.float16
+#     scaling_factor = 0.476986
+#     processor_cls = VideoProcessor
+#     output_pt_slice = torch.tensor([112, 92, 85, 112, 93, 85, 112, 94, 85], dtype=torch.uint8)
+#     partial_postprocess_return_pt_slice = torch.tensor(
+#         [149, 161, 168, 136, 150, 156, 129, 143, 149], dtype=torch.uint8
+#     )
+#     return_pt_slice = torch.tensor([0.1656, 0.2661, 0.3157, 0.0693, 0.1755, 0.2252, 0.0127, 0.1221, 0.1708])
