Skip to content

Commit d9fa4b7

Browse files
committed
perf: improve throughput with threaded model execution
Use threading to run multiple predictions in parallel, increasing request processing throughput.
1 parent 5c5abfa commit d9fa4b7

File tree

2 files changed

+12
-4
lines changed

2 files changed

+12
-4
lines changed

app/model.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import asyncio
12
import logging
3+
from concurrent.futures import ThreadPoolExecutor
24
from os import cpu_count, environ
35

46
import torch
@@ -9,6 +11,9 @@
911

1012
from .utils import clear_text, measure_time
1113

14+ loop = asyncio.get_running_loop()
15+ loop.set_default_executor(ThreadPoolExecutor())
16+
1217
# Environment
1318
load_dotenv()
1419

@@ -142,7 +147,7 @@ def call_model(text: str) -> Tensor:
142147
return outputs.logits
143148

144149

145-
def predict(text: str) -> bool:
150+
def sync_predict(text: str) -> bool:
146151
text = clear_text(text).lower()
147152
if not text:
148153
return False
@@ -157,3 +162,7 @@ def predict(text: str) -> bool:
157162

158163
log_prediction(text, logits, result, execution_time)
159164
return result
165+
166+
167+ async def async_predict(text: str) -> bool:
168+     return await loop.run_in_executor(None, sync_predict, text)

app/server.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import asyncio
21
import time
32

43
from prometheus_client import (
@@ -14,8 +13,8 @@
1413
from starlette.responses import PlainTextResponse, Response
1514
from starlette.routing import Route, Router
1615

16+
from .model import async_predict as model_predict
1717
from .model import metrics_prefix
18-
from .model import predict as call_model
1918

2019
# Initialize Prometheus metrics
2120
disable_created_metrics()
@@ -44,7 +43,7 @@ async def predict(request: Request):
4443

4544
try:
4645
text = request.query_params.get("text")
47-
result = await asyncio.to_thread(call_model, text) if text else False
46+
result = await model_predict(text) if text else False
4847

4948
label = "toxic" if result else "non_toxic"
5049
REQUEST_COUNT.labels(endpoint=endpoint, result=label).inc()

0 commit comments

Comments (0)