feat: support gzip & zstd compression (#599)

kemingy · web-flow · commit 74f41f675d58 · 2024-11-18T20:16:43.000+08:00
* feat: support gzip & zstd compression

Signed-off-by: Keming <kemingy94@gmail.com>

* fix args help

Signed-off-by: Keming <kemingy94@gmail.com>

* add compression example

Signed-off-by: Keming <kemingy94@gmail.com>

---------

Signed-off-by: Keming <kemingy94@gmail.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mosec"
-version = "0.8.9"
+version = "0.9.0"
 authors = ["Keming <kemingy94@gmail.com>", "Zichen <lkevinzc@gmail.com>"]
 edition = "2021"
 license = "Apache-2.0"
@@ -25,3 +25,5 @@ serde = "1.0"
 serde_json = "1.0"
 utoipa = "5"
 utoipa-swagger-ui = { version = "8", features = ["axum"] }
+tower = "0.5.1"
+tower-http = {version = "0.6.1", features = ["compression-zstd", "decompression-zstd", "compression-gzip", "decompression-gzip"]}
diff --git a/README.md b/README.md
@@ -193,6 +193,7 @@ More ready-to-use examples can be found in the [Example](https://mosecorg.github
 - [Customized GPU allocation](https://mosecorg.github.io/mosec/examples/env.html): deploy multiple replicas, each using different GPUs.
 - [Customized metrics](https://mosecorg.github.io/mosec/examples/metric.html): record your own metrics for monitoring.
 - [Jax jitted inference](https://mosecorg.github.io/mosec/examples/jax.html): just-in-time compilation speeds up the inference.
+- [Compression](https://mosecorg.github.io/mosec/examples/compression.html): enable request/response compression.
 - PyTorch deep learning models:
   - [sentiment analysis](https://mosecorg.github.io/mosec/examples/pytorch.html#natural-language-processing): infer the sentiment of a sentence.
   - [image recognition](https://mosecorg.github.io/mosec/examples/pytorch.html#computer-vision): categorize a given image.
diff --git a/docs/source/examples/compression.md b/docs/source/examples/compression.md
@@ -0,0 +1,33 @@
+# Compression
+
+This example demonstrates how to use the `--compression` feature for segmentation tasks. We use the example from the [Segment Anything Model 2](https://github.com/facebookresearch/sam2/blob/main/notebooks/image_predictor_example.ipynb). The request includes an image and its low-resolution mask; the response is the final mask. Since there are lots of duplicate values in the mask, we can use `gzip` or `zstd` to compress it.
+
+## Server
+
+```shell
+python examples/segment/server.py --compression
+```
+
+<details>
+<summary>server.py</summary>
+
+```{include} ../../../examples/segment/server.py
+:code: python
+```
+
+</details>
+
+## Client
+
+```shell
+python examples/segment/client.py
+```
+
+<details>
+<summary>client.py</summary>
+
+```{include} ../../../examples/segment/client.py
+:code: python
+```
+
+</details>
diff --git a/docs/source/examples/index.md b/docs/source/examples/index.md
@@ -16,6 +16,7 @@ pytorch
 rerank
 stable_diffusion
 validate
+compression
 ```
 
 We provide examples across different ML frameworks and for various tasks in this section.
diff --git a/examples/segment/client.py b/examples/segment/client.py
@@ -0,0 +1,52 @@
+# Copyright 2023 MOSEC Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+from http import HTTPStatus
+from io import BytesIO
+
+import httpx
+import msgpack  # type: ignore
+import numbin
+import numpy as np
+from PIL import Image  # type: ignore
+
+truck_image = Image.open(
+    BytesIO(
+        httpx.get(
+            "https://raw.githubusercontent.com/facebookresearch/sam2/main/notebooks/images/truck.jpg"
+        ).content
+    )
+)
+array = np.array(truck_image.convert("RGB"))
+# assume we have obtained the low-resolution mask from the previous step
+mask = np.zeros((256, 256))
+
+resp = httpx.post(
+    "http://127.0.0.1:8000/inference",
+    content=gzip.compress(
+        msgpack.packb(  # type: ignore
+            {
+                "image": numbin.dumps(array),
+                "mask": numbin.dumps(mask),
+                "labels": [1, 1],
+                "point_coords": [[500, 375], [1125, 625]],
+            }
+        )
+    ),
+    headers={"Accept-Encoding": "gzip", "Content-Encoding": "gzip"},
+)
+assert resp.status_code == HTTPStatus.OK, resp.status_code
+res = numbin.loads(msgpack.loads(resp.content))
+assert res.shape == array.shape[:2], f"expect {array.shape[:2]}, got {res.shape}"
diff --git a/examples/segment/server.py b/examples/segment/server.py
@@ -0,0 +1,66 @@
+# Copyright 2023 MOSEC Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to https://github.com/facebookresearch/sam2/blob/main/notebooks/image_predictor_example.ipynb
+
+import numbin
+import torch  # type: ignore
+from sam2.sam2_image_predictor import SAM2ImagePredictor  # type: ignore
+
+from mosec import Server, Worker, get_logger
+from mosec.mixin import MsgpackMixin
+
+logger = get_logger()
+MIN_TF32_MAJOR = 8
+
+
+class SegmentAnything(MsgpackMixin, Worker):
+    def __init__(self):
+        # select the device for computation
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            device = torch.device("mps")
+        else:
+            device = torch.device("cpu")
+        logger.info("using device: %s", device)
+
+        self.predictor = SAM2ImagePredictor.from_pretrained(
+            "facebook/sam2-hiera-large", device=device
+        )
+
+        if device.type == "cuda":
+            # use bfloat16
+            torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
+            # turn on tf32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+            if torch.cuda.get_device_properties(0).major >= MIN_TF32_MAJOR:
+                torch.backends.cuda.matmul.allow_tf32 = True
+                torch.backends.cudnn.allow_tf32 = True
+
+    def forward(self, data: dict) -> bytes:
+        with torch.inference_mode():
+            self.predictor.set_image(numbin.loads(data["image"]))
+            masks, _, _ = self.predictor.predict(
+                point_coords=data["point_coords"],
+                point_labels=data["labels"],
+                mask_input=numbin.loads(data["mask"])[None, :, :],
+                multimask_output=False,
+            )
+        return numbin.dumps(masks[0])
+
+
+if __name__ == "__main__":
+    server = Server()
+    server.append_worker(SegmentAnything, num=1, max_batch_size=1)
+    server.run()
diff --git a/mosec/args.py b/mosec/args.py
@@ -134,6 +134,13 @@ def build_arguments_parser() -> argparse.ArgumentParser:
         "This will omit the worker number for each stage.",
         action="store_true",
     )
+
+    parser.add_argument(
+        "--compression",
+        help="Enable `zstd` & `gzip` compression for the request & response",
+        action="store_true",
+    )
+
     return parser
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Rust",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -7,3 +7,4 @@ ruff>=0.7
 pre-commit>=2.15.0
 httpx[http2]==0.27.2
 httpx-sse==0.4.0
+zstandard~=0.23
diff --git a/src/config.rs b/src/config.rs
@@ -65,6 +65,8 @@ pub(crate) struct Config {
     pub namespace: String,
     // log level: (debug, info, warning, error)
     pub log_level: String,
+    // `zstd` & `gzip` compression
+    pub compression: bool,
     pub runtimes: Vec<Runtime>,
     pub routes: Vec<Route>,
 }
@@ -79,6 +81,7 @@ impl Default for Config {
             port: 8000,
             namespace: String::from("mosec_service"),
             log_level: String::from("info"),
+            compression: false,
             runtimes: vec![Runtime {
                 max_batch_size: 64,
                 max_wait_time: 3000,
diff --git a/src/main.rs b/src/main.rs
diff --git a/src/routes.rs b/src/routes.rs
diff --git a/tests/test_service.py b/tests/test_service.py