
Commit 52511c0

feat(relieve): discard request if the caller is not waiting for the answer anymore

When behind a proxy, this requires the proxy to close the connection to be effective.

Signed-off-by: Raphael Glon <[email protected]>
1 parent c0a0e42 commit 52511c0

6 files changed: +98 −7 lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@ torch==2.5.1
 torchvision
 torchaudio
 peft==0.15.1
+psutil>=6.0.0
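
The new psutil dependency backs the connection scan added to utils.py further down. A minimal sketch of the one API the commit relies on, psutil.net_connections (illustrative only, not part of the diff):

```python
import psutil

# Each entry is a namedtuple: (fd, family, type, laddr, raddr, status, pid).
for conn in psutil.net_connections(kind="tcp"):
    if conn.status == psutil.CONN_ESTABLISHED and conn.raddr:
        # remote peer of every live TCP connection on this host
        print(conn.raddr.ip, conn.raddr.port)
```

Note that enumerating sockets may require elevated privileges on some platforms (notably macOS); on Linux it works unprivileged.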

src/huggingface_inference_toolkit/handler.py

Lines changed: 18 additions & 3 deletions
@@ -5,7 +5,12 @@
 from huggingface_inference_toolkit import logging
 from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
 from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler
-from huggingface_inference_toolkit.utils import check_and_register_custom_pipeline_from_directory
+from huggingface_inference_toolkit.logging import logger
+from huggingface_inference_toolkit.utils import (
+    already_left,
+    check_and_register_custom_pipeline_from_directory,
+    should_discard_left,
+)


 class HuggingFaceHandler:
@@ -39,7 +44,17 @@ def __call__(self, data: Dict[str, Any]):
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})

-        # diffusers and sentence transformers pipelines do not have the `task` arg
+        if "handler_params" in data:
+            handler_params = data.pop("handler_params")
+            if should_discard_left():
+                request = handler_params.get("request")
+                if not request:
+                    logger.warn("Cannot know if request caller already left, missing request handler param")
+                elif already_left(request):
+                    logger.info("Discarding request as the caller already left")
+                    return None
+
+        # diffusers and sentence transformers pipelines do not have the `task` arg
         if not hasattr(self.pipeline, "task"):
             # sentence transformers parameters not supported yet
             if any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
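
To make the new branch concrete, here is a hypothetical call site showing the payload shape the handler now understands; `handler` and `request` stand in for the live HuggingFaceHandler and Starlette request that the web layer (see webservice_starlette.py below) actually wires together:

```python
# Sketch only: `handler` is a HuggingFaceHandler instance, `request` the live
# starlette.requests.Request injected by the web layer.
payload = {
    "inputs": "Hello world",
    "handler_params": {"request": request},
}
pred = handler(payload)
if pred is None:
    # DISCARD_LEFT was enabled and already_left(request) returned True:
    # the job was dropped before ever touching the pipeline.
    ...
```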
@@ -168,7 +183,7 @@ def __call__(self, data: Dict[str, Any]):
                 scores = resp['scores']
                 if len(labels) == len(scores):
                     new_resp = []
-                    for label, score in zip(labels, scores):
+                    for label, score in zip(labels, scores, strict=True):
                         new_resp.append({"label": label, "score": score})
                     resp = new_resp
                 else:
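
The `strict=True` flag (Python 3.10+) is purely defensive here, since the surrounding `len(labels) == len(scores)` check already guarantees equal lengths; with the flag, a future regression raises instead of silently truncating:

```python
labels = ["positive", "negative"]
scores = [0.9]

print(list(zip(labels, scores)))  # [('positive', 0.9)] -- shorter input wins silently

try:
    list(zip(labels, scores, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1
```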

src/huggingface_inference_toolkit/heavy_utils.py

Lines changed: 1 addition & 1 deletion
@@ -184,4 +184,4 @@ def get_pipeline(
         hf_pipeline.model.config.forced_decoder_ids = hf_pipeline.tokenizer.get_decoder_prompt_ids(
             language="english", task="transcribe"
         )
-    return hf_pipeline # type: ignore
+    return hf_pipeline  # type: ignore

src/huggingface_inference_toolkit/serialization/base.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ class ContentType:
     @staticmethod
     def get_deserializer(content_type: str, task: str):
         if not content_type:
-            message = f"No content type provided and no default one configured."
+            message = "No content type provided and no default one configured."
             raise Exception(message)
         if content_type.lower().startswith("application/octet-stream"):
            if "audio" in task or "speech" in task:

src/huggingface_inference_toolkit/utils.py

Lines changed: 66 additions & 1 deletion
@@ -1,7 +1,12 @@
 import importlib.util
+import ipaddress
+import os
 import sys
 from pathlib import Path

+import psutil
+from starlette.requests import Request
+
 from huggingface_inference_toolkit.const import HF_DEFAULT_PIPELINE_NAME, HF_MODULE_NAME
 from huggingface_inference_toolkit.logging import logger

@@ -66,7 +71,7 @@ def check_and_register_custom_pipeline_from_directory(model_dir):
             # init custom handler with model_dir
             custom_pipeline = handler.EndpointHandler(model_dir)
         else:
-            logger.info(f"No spec from file location found for module %s, file %s", HF_MODULE_NAME, custom_module)
+            logger.info("No spec from file location found for module %s, file %s", HF_MODULE_NAME, custom_module)
     elif legacy_module.is_file():
         logger.warning(
             """You are using a legacy custom pipeline.
@@ -99,3 +104,63 @@ def convert_params_to_int_or_bool(params):
         if v == "true":
             params[k] = True
     return params
+
+
+def should_discard_left() -> bool:
+    return os.getenv('DISCARD_LEFT', '0').lower() in ['true', 'yes', '1']
+
+
+def already_left(request: Request) -> bool:
+    """
+    Check if the caller has already left without waiting for the answer to come. This can help during burst to relieve
+    the pressure on the worker by cancelling jobs whose results don't matter as they won't be fetched anyway
+    :param request:
+    :return: bool
+    """
+    # NOTE: Starlette method request.is_disconnected is totally broken, consumes the payload, does not return
+    # the correct status. So we use the good old way to identify if the caller is still there.
+    # In any case, if we are not sure, we return False
+    logger.info("Checking if request caller already left")
+    try:
+        client = request.client
+        host = client.host
+        if not host:
+            return False
+
+        port = int(client.port)
+        host = ipaddress.ip_address(host)
+
+        if port <= 0 or port > 65535:
+            logger.warning("Unexpected source port format for caller %s", port)
+            return False
+        counter = 0
+        for connection in psutil.net_connections(kind="tcp"):
+            counter += 1
+            if connection.status != "ESTABLISHED":
+                continue
+            if not connection.raddr:
+                continue
+            if int(connection.raddr.port) != port:
+                continue
+            if (
+                not connection.raddr.ip
+                or ipaddress.ip_address(connection.raddr.ip) != host
+            ):
+                continue
+            logger.info(
+                "Found caller connection still established, caller is most likely still there, %s",
+                connection,
+            )
+            return False
+    except Exception as e:
+        logger.warning(
+            "Unexpected error while checking if caller already left, assuming still there"
+        )
+        logger.exception(e)
+        return False
+
+    logger.info(
+        "%d connections checked. No connection found matching to the caller, probably left",
+        counter,
+    )
+    return True
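
A sketch of how the two helpers could be exercised in isolation. The scope dict and addresses are invented for the example; since no real peer is connected from a TEST-NET address, already_left should come back True:

```python
import os

from starlette.requests import Request

from huggingface_inference_toolkit.utils import already_left, should_discard_left

os.environ["DISCARD_LEFT"] = "1"  # the flag read by should_discard_left()

# Minimal ASGI scope; request.client is built from the (host, port) pair.
scope = {
    "type": "http",
    "method": "POST",
    "path": "/",
    "headers": [],
    "client": ("203.0.113.7", 54321),  # TEST-NET-3 address: no such peer exists
}
request = Request(scope)

if should_discard_left() and already_left(request):
    print("caller is gone, the job can be dropped")
```

Note the deliberate asymmetry: any doubt (missing host, odd port, psutil failure) yields False, so a request is only discarded on positive evidence that the caller's TCP connection is gone.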

src/huggingface_inference_toolkit/webservice_starlette.py

Lines changed: 11 additions & 1 deletion
@@ -22,12 +22,13 @@
 )
 from huggingface_inference_toolkit.env_utils import api_inference_compat
 from huggingface_inference_toolkit.handler import (
+    HuggingFaceHandler,
     get_inference_handler_either_custom_or_default_handler,
 )
 from huggingface_inference_toolkit.logging import logger
 from huggingface_inference_toolkit.serialization.base import ContentType
 from huggingface_inference_toolkit.serialization.json_utils import Jsoner
-from huggingface_inference_toolkit.utils import convert_params_to_int_or_bool
+from huggingface_inference_toolkit.utils import convert_params_to_int_or_bool, should_discard_left
 from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs

 INFERENCE_HANDLERS = {}
@@ -101,6 +102,7 @@ async def metrics(request):

 async def predict(request):
     global INFERENCE_HANDLERS
+
     if not MODEL_DOWNLOADED:
         with MODEL_DL_LOCK:
             _eager_model_dl()
@@ -154,6 +156,10 @@ async def predict(request):
     # tracks request time
     start_time = perf_counter()

+    if should_discard_left() and isinstance(inference_handler, HuggingFaceHandler):
+        deserialized_body['handler_params'] = {
+            'request': request
+        }
     with idle.request_witnesses():
         # run async not blocking call
         pred = await async_handler_call(inference_handler, deserialized_body)
@@ -163,6 +169,10 @@ async def predict(request):
         f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms"
     )

+    if should_discard_left() and pred is None:
+        logger.info("No content returned as caller already left")
+        return Response(status_code=204)
+
     # response extracts content from request
     accept = request.headers.get("accept")
     if accept is None or accept == "*/*":
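
End to end, the feature could be exercised like this; the endpoint URL, port, and client code are assumptions for illustration, since only the server side is defined by this commit. A caller that gives up closes its socket; the worker's connection scan then comes up empty, inference is skipped, and the answer is an (unread) body-less 204:

```python
import httpx

# Hypothetical client against a server started with DISCARD_LEFT=1.
try:
    httpx.post(
        "http://localhost:5000/predict",   # assumed endpoint and port
        json={"inputs": "some long generation"},
        timeout=0.5,                       # give up quickly; the closed socket is the signal
    )
except httpx.TimeoutException:
    pass  # caller is gone; server-side inference is discarded
```

As the commit message warns, this only works when the worker actually observes the disconnect: behind a proxy, the proxy must close its upstream connection when the client leaves, otherwise the ESTABLISHED check keeps passing.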
