Fix Windows Threading Issues (#385)

FrsECM · pre-commit-ci[bot] · aniketmaurya · web-flow · commit c7d8d2ff2b8d · 2025-04-30T15:02:16.000+01:00
* Fix bug on windows with uvicorn when multiple workers. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Force socket to listen before starting server * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix Ctrl+C on windows * Update src/litserve/server.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix comments - Ctrl+C on Windows * Update src/litserve/server.py Fix Type hint Co-authored-by: Aniket Maurya <theaniketmaurya@gmail.com> * Update src/litserve/server.py Remove windows comment. Co-authored-by: Aniket Maurya <theaniketmaurya@gmail.com> * Fix threading import Thread * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Increase test timeout => 30mn * Fix default self._uvicorn_servers * Fix sockets iteration * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * No Need to catch Keyboard Interrupt on windows. just close Threads * Update Timeout + testing CICD * Fix timeout for gpu tests * Fix KeyboardInterrupt Windows * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * KeyboardInterrupt Windows - MultipleWorkers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move pid detection to LitLoop for less intrusivity * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert changes in CICD * Remove lock (useless) * Apply suggestions from code review --------- Co-authored-by: Francois Ponchon <francois.ponchon@michelin.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Aniket Maurya <theaniketmaurya@gmail.com>
diff --git a/src/litserve/loops/base.py b/src/litserve/loops/base.py
@@ -14,7 +14,9 @@
 import asyncio
 import inspect
 import logging
+import os
 import pickle
+import signal
 import sys
 import time
 from abc import ABC
@@ -212,6 +214,15 @@ def run(
 class LitLoop(_BaseLoop):
     def __init__(self):
         self._context = {}
+        self._server_pid = os.getpid()
+
+    def kill(self):
+        try:
+            print(f"Stop Server Requested - Kill parent pid [{self._server_pid}] from [{os.getpid()}]")
+            os.kill(self._server_pid, signal.SIGTERM)
+        except PermissionError:
+            # Access denied: the parent pid has already been killed.
+            return
 
     def get_batch_requests(
         self,
diff --git a/src/litserve/loops/simple_loops.py b/src/litserve/loops/simple_loops.py
@@ -42,6 +42,9 @@ def run_single_loop(
                 response_queue_id, uid, timestamp, x_enc = request_queue.get(timeout=1.0)
             except (Empty, ValueError):
                 continue
+            except KeyboardInterrupt:  # pragma: no cover
+                self.kill()
+                return
 
             if (lit_api.request_timeout and lit_api.request_timeout != -1) and (
                 time.monotonic() - timestamp > lit_api.request_timeout
@@ -213,7 +216,9 @@ def run_batched_loop(
                         PickleableHTTPException.from_exception(e),
                         LitAPIStatus.ERROR,
                     )
-
+            except KeyboardInterrupt:  # pragma: no cover
+                self.kill()
+                return
             except Exception as e:
                 logger.exception(
                     "LitAPI ran into an error while processing the batched request.\n"
diff --git a/src/litserve/loops/streaming_loops.py b/src/litserve/loops/streaming_loops.py
@@ -97,6 +97,9 @@ def run_streaming_loop(
                     PickleableHTTPException.from_exception(e),
                     LitAPIStatus.ERROR,
                 )
+            except KeyboardInterrupt:  # pragma: no cover
+                self.kill()
+                return
             except Exception as e:
                 logger.exception(
                     "LitAPI ran into an error while processing the streaming request uid=%s.\n"
@@ -185,6 +188,9 @@ def run_batched_streaming_loop(
 
                 for response_queue_id, uid in zip(response_queue_ids, uids):
                     self.put_response(transport, response_queue_id, uid, "", LitAPIStatus.FINISH_STREAMING)
+            except KeyboardInterrupt:  # pragma: no cover
+                self.kill()
+                return
 
             except HTTPException as e:
                 for response_queue_id, uid in zip(response_queue_ids, uids):
diff --git a/src/litserve/server.py b/src/litserve/server.py
@@ -26,9 +26,12 @@
 import warnings
 from collections import deque
 from contextlib import asynccontextmanager
+from multiprocessing.context import Process
+from threading import Thread
 from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import uvicorn
+import uvicorn.server
 from fastapi import Depends, FastAPI, HTTPException, Request, Response
 from fastapi.responses import JSONResponse, StreamingResponse
 from fastapi.security import APIKeyHeader
@@ -176,7 +179,6 @@ def __init__(
                 DeprecationWarning,
                 stacklevel=2,
             )
-
             lit_api.max_batch_size = max_batch_size
             lit_api.batch_timeout = batch_timeout
         if isinstance(spec, LitSpec):
@@ -341,6 +343,7 @@ def launch_inference_worker(self, num_uvicorn_servers: int):
                 ),
             )
             process.start()
+            print(f"Inference Worker {worker_id} - [{process.pid}]")
             process_list.append(process)
         return manager, process_list
 
@@ -599,20 +602,28 @@ def run(
         elif api_server_worker_type is None:
             api_server_worker_type = "process"
 
-        manager, litserve_workers = self.launch_inference_worker(num_api_servers)
+        manager, inference_workers = self.launch_inference_worker(num_api_servers)
 
         self.verify_worker_status()
         try:
-            servers = self._start_server(port, num_api_servers, log_level, sockets, api_server_worker_type, **kwargs)
+            uvicorn_workers = self._start_server(
+                port, num_api_servers, log_level, sockets, api_server_worker_type, **kwargs
+            )
             print(f"Swagger UI is available at http://0.0.0.0:{port}/docs")
-            for s in servers:
-                s.join()
+            # On Linux, the kill signal is captured by uvicorn.
+            # => The workers will join and raise a KeyboardInterrupt, allowing the server to shut down.
+            for i, uw in enumerate(uvicorn_workers):
+                uw: Union[Process, Thread]
+                if isinstance(uw, Process):
+                    print(f"Uvicorn worker {i} : [{uw.pid}]")
+                uw.join()
         finally:
             print("Shutting down LitServe")
             self._transport.close()
-            for w in litserve_workers:
-                w.terminate()
-                w.join()
+            for iw in inference_workers:
+                iw: Process
+                iw.terminate()
+                iw.join()
             manager.shutdown()
 
     def _prepare_app_run(self, app: FastAPI):
@@ -622,16 +633,24 @@ def _prepare_app_run(self, app: FastAPI):
         app.add_middleware(RequestCountMiddleware, active_counter=active_counter)
 
     def _start_server(self, port, num_uvicorn_servers, log_level, sockets, uvicorn_worker_type, **kwargs):
-        servers = []
+        workers = []
         for response_queue_id in range(num_uvicorn_servers):
             self.app.response_queue_id = response_queue_id
             if self.lit_spec:
                 self.lit_spec.response_queue_id = response_queue_id
             app: FastAPI = copy.copy(self.app)
 
             self._prepare_app_run(app)
-
             config = uvicorn.Config(app=app, host="0.0.0.0", port=port, log_level=log_level, **kwargs)
+            if sys.platform == "win32" and num_uvicorn_servers > 1:
+                logger.debug("Enable Windows explicit socket sharing...")
+                # Make sure the sockets are listening...
+                # This prevents a later [WinError 10022].
+                for sock in sockets:
+                    sock.listen(config.backlog)
+                # Set workers to tell uvicorn to use a shared socket (win32)
+                # https://github.com/encode/uvicorn/pull/802
+                config.workers = num_uvicorn_servers
             server = uvicorn.Server(config=config)
             if uvicorn_worker_type == "process":
                 ctx = mp.get_context("fork")
@@ -641,8 +660,8 @@ def _start_server(self, port, num_uvicorn_servers, log_level, sockets, uvicorn_w
             else:
                 raise ValueError("Invalid value for api_server_worker_type. Must be 'process' or 'thread'")
             w.start()
-            servers.append(w)
-        return servers
+            workers.append(w)
+        return workers
 
     def setup_auth(self):
         if hasattr(self.lit_api, "authorize") and callable(self.lit_api.authorize):