Making faster whisper input more uniform (#218)

htrivedi99 · web-flow · commit 9fd460220461 · 2024-02-27T13:04:26.000-08:00
diff --git a/templates/faster-whisper-truss/model/model.py b/templates/faster-whisper-truss/model/model.py
@@ -1,3 +1,4 @@
+import base64
 from tempfile import NamedTemporaryFile
 from typing import Any, Dict
 
@@ -16,13 +17,37 @@ def load(self):
         self._model = WhisperModel(self._config["model_metadata"]["model_id"])
 
     def preprocess(self, request: Dict) -> Dict:
-        resp = requests.get(request["url"])
-        return {"response": resp.content}
+        audio_base64 = request.get("audio")
+        url = request.get("url")
+
+        if audio_base64 and url:
+            return {
+                "error": "Only a base64 audio file OR a URL can be passed to the API, not both of them.",
+            }
+        if not audio_base64 and not url:
+            return {
+                "error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
+            }
+
+        binary_data = None
+
+        if audio_base64:
+            binary_data = base64.b64decode(audio_base64)
+        elif url:
+            resp = requests.get(url)
+            binary_data = resp.content
+
+        return {"data": binary_data}
 
     def predict(self, request: Dict) -> Dict:
+        if request.get("error"):
+            return request
+
+        audio_data = request.get("data")
         result_segments = []
+
         with NamedTemporaryFile() as fp:
-            fp.write(request["response"])
+            fp.write(audio_data)
             segments, info = self._model.transcribe(
                 fp.name,
                 temperature=0,
diff --git a/whisper/faster-whisper-v2/model/model.py b/whisper/faster-whisper-v2/model/model.py
@@ -1,3 +1,4 @@
+import base64
 from tempfile import NamedTemporaryFile
 from typing import Any, Dict
 
@@ -16,13 +17,37 @@ def load(self):
         self._model = WhisperModel(self._config["model_metadata"]["model_id"])
 
     def preprocess(self, request: Dict) -> Dict:
-        resp = requests.get(request["url"])
-        return {"response": resp.content}
+        audio_base64 = request.get("audio")
+        url = request.get("url")
+
+        if audio_base64 and url:
+            return {
+                "error": "Only a base64 audio file OR a URL can be passed to the API, not both of them.",
+            }
+        if not audio_base64 and not url:
+            return {
+                "error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
+            }
+
+        binary_data = None
+
+        if audio_base64:
+            binary_data = base64.b64decode(audio_base64)
+        elif url:
+            resp = requests.get(url)
+            binary_data = resp.content
+
+        return {"data": binary_data}
 
     def predict(self, request: Dict) -> Dict:
+        if request.get("error"):
+            return request
+
+        audio_data = request.get("data")
         result_segments = []
+
         with NamedTemporaryFile() as fp:
-            fp.write(request["response"])
+            fp.write(audio_data)
             segments, info = self._model.transcribe(
                 fp.name,
                 temperature=0,
diff --git a/whisper/faster-whisper-v3/model/model.py b/whisper/faster-whisper-v3/model/model.py
@@ -1,3 +1,4 @@
+import base64
 from tempfile import NamedTemporaryFile
 from typing import Any, Dict
 
@@ -16,13 +17,37 @@ def load(self):
         self._model = WhisperModel(self._config["model_metadata"]["model_id"])
 
     def preprocess(self, request: Dict) -> Dict:
-        resp = requests.get(request["url"])
-        return {"response": resp.content}
+        audio_base64 = request.get("audio")
+        url = request.get("url")
+
+        if audio_base64 and url:
+            return {
+                "error": "Only a base64 audio file OR a URL can be passed to the API, not both of them.",
+            }
+        if not audio_base64 and not url:
+            return {
+                "error": "Please provide either an audio file in base64 string format or a URL to an audio file.",
+            }
+
+        binary_data = None
+
+        if audio_base64:
+            binary_data = base64.b64decode(audio_base64)
+        elif url:
+            resp = requests.get(url)
+            binary_data = resp.content
+
+        return {"data": binary_data}
 
     def predict(self, request: Dict) -> Dict:
+        if request.get("error"):
+            return request
+
+        audio_data = request.get("data")
         result_segments = []
+
         with NamedTemporaryFile() as fp:
-            fp.write(request["response"])
+            fp.write(audio_data)
             segments, info = self._model.transcribe(
                 fp.name,
                 temperature=0,