
Commit 0487fe9

Whisper Torchserve truss (#225)
1 parent 0f9ed68 commit 0487fe9


5 files changed: +181 −0 lines changed

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@

# Whisper Torchserve

This Truss lets you run a Whisper model with [TorchServe](https://pytorch.org/serve/) as the serving backend.

## Deployment

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`

With `whisper/whisper-torchserve` as your working directory, you can deploy the model with:

```
truss push
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).

## Model Inputs

The model takes in one input:

- __audio__: An audio file as a base64 string (see the example request below)
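
For example, a request body looks like this; the base64 string here is just the truncated prefix of a WAV file, for illustration:

```json
{"audio": "UklGRiQAAABXQVZF..."}
```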

## A few things to note

TorchServe requires a compiled `.mar` file in order to serve the model. This [README](https://github.com/pytorch/serve/blob/master/model-archiver/README.md) gives a brief explanation of how to generate that file; a rough sketch of the archiver invocation is also shown at the end of this section. Once the `.mar` file is generated, it needs to be placed in the `data/model_store` directory. The `data/` directory also contains a configuration file for TorchServe called `config.properties`. That file looks something like this:

```
inference_address=http://0.0.0.0:8888
batch_size=4
ipex_enable=true
async_logging=true

models={\
  "whisper_base": {\
    "1.0": {\
      "defaultVersion": true,\
      "marName": "whisper_base.mar",\
      "minWorkers": 1,\
      "maxWorkers": 2,\
      "batchSize": 4,\
      "maxBatchDelay": 500,\
      "responseTimeout": 24\
    }\
  }\
}
```

Here you can specify the `batchSize` as well as the name of your `.mar` file using `marName`. When TorchServe starts, it looks for the `.mar` file inside the `data/model_store` directory with the `marName` defined above. TorchServe batches up to `batchSize` concurrent requests, waiting at most `maxBatchDelay` milliseconds before dispatching a partial batch.
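
For reference, generating the `.mar` file with PyTorch's `torch-model-archiver` CLI looks roughly like this. This is a sketch, not the exact command used to build this model's archive: the handler file name (`handler.py`) is a placeholder for whatever custom handler wraps the Whisper weights.

```
torch-model-archiver \
  --model-name whisper_base \
  --version 1.0 \
  --handler handler.py \
  --export-path data/model_store
```

The `--model-name` flag determines the name of the generated archive, which must match the `marName` set in `config.properties`.
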
## Invoking the model

Here is an example in Python:

```python
import base64

import requests


def wav_to_base64(file_path):
    # Read the WAV file and encode its raw bytes as a base64 string
    with open(file_path, "rb") as wav_file:
        binary_data = wav_file.read()
        base64_data = base64.b64encode(binary_data)
        base64_string = base64_data.decode("utf-8")
        return base64_string


resp = requests.post(
    "https://model-<model-id>.api.baseten.co/development/predict",
    headers={"Authorization": "Api-Key BASETEN-API-KEY"},
    json={"audio": wav_to_base64("/path/to/audio-file/60-sec.wav")},
)

print(resp.json())
```

Here is a sample output:

```json
{"output": "Let me make it clear. His conduct is unacceptable. He's unfit. And be careful of what you're gonna get. He doesn't care for the American people. It's Donald Trump first. This is what I want people to understand. These people have... I mean, she has no idea what the hell the names of those provinces are, but she wants to send our sons and daughters and our troops and our military equipment to go fight it. Look at the blank expression. She doesn't know the names of the provinces. You do this at every debate. You say, no, don't interrupt me. I didn't interrupt you."}
```

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
environment_variables: {}
external_package_dirs: []
model_metadata: {}
model_name: Whisper Torchserve
python_version: py310
requirements:
- torch==2.1.0
- torchserve==0.9.0
- ffmpeg-python==0.2.0
- transformers==4.37.2
- nvgpu==0.10.0
- httpx==0.27.0
resources:
  accelerator: T4
  use_gpu: true
model_cache:
- repo_id: htrivedi99/whisper-torchserve
secrets: {}
system_packages:
- ffmpeg
- openjdk-11-jdk
runtime:
  predict_concurrency: 128
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
inference_address=http://0.0.0.0:8888
batch_size=4
ipex_enable=true
async_logging=true

models={\
  "whisper_base": {\
    "1.0": {\
      "defaultVersion": true,\
      "marName": "whisper_base.mar",\
      "minWorkers": 1,\
      "maxWorkers": 2,\
      "batchSize": 4,\
      "maxBatchDelay": 500,\
      "responseTimeout": 24\
    }\
  }\
}

# default_workers_per_model=2

whisper/whisper-torchserve/model/__init__.py

Whitespace-only changes.
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import base64
import multiprocessing
import os
import subprocess
from typing import Dict

import httpx
from huggingface_hub import snapshot_download

TORCHSERVE_ENDPOINT = "http://0.0.0.0:8888/predictions/whisper_base"


class Model:
    def __init__(self, **kwargs):
        self._data_dir = kwargs["data_dir"]
        self._model = None

    def start_torchserve(self):
        # Run TorchServe in the foreground; this call blocks, so load()
        # launches it in a separate process.
        subprocess.run(
            [
                "torchserve",
                "--start",
                "--model-store",
                f"{self._data_dir}/model_store",
                "--models",
                "whisper_base.mar",
                "--foreground",
                "--no-config-snapshots",
                "--ts-config",
                f"{self._data_dir}/config.properties",
            ],
            check=True,
        )

    def load(self):
        # Download the pre-compiled .mar file into the model store directory
        snapshot_download(
            "htrivedi99/whisper-torchserve",
            local_dir=os.path.join(self._data_dir, "model_store"),
            max_workers=4,
        )
        print("Downloaded weights successfully!")

        process = multiprocessing.Process(target=self.start_torchserve)
        process.start()

    async def predict(self, request: Dict):
        audio_base64 = request.get("audio")
        audio_bytes = base64.b64decode(audio_base64)

        # Forward the decoded audio bytes to the TorchServe endpoint
        # as multipart form data and return the transcription text.
        async with httpx.AsyncClient() as client:
            res = await client.post(
                TORCHSERVE_ENDPOINT, files={"data": (None, audio_bytes)}
            )
            transcription = res.text
        return {"output": transcription}
