Skip to content

Commit 6a59ca1

Browse files
authored
lv2v: Restart infer.py process and cleanup previous stream quickly (#527)
* live/api: Clean up multipart temp files dir. As I was understanding the code to add the last params file, I cleaned up usage of that other one, which was a little confusing (I wrote it myself, haha) and had no explanations. * live/api: You know what? Remove multipart altogether. We don't use it anymore; the runner API talks to us only in JSON. * runner/api: Clean up previous stream trickle channels on start * runner/app: Restart infer.py process on crashes * runner/app: Add a 1s grace period for process startup * lv2v: Final fixes from testing. Turns out the stdout/err streams don't close automatically when the process exits... Tested many ways, but Python process management is really bad. Had to work around a potential thread leak that could happen. Also joined STDERR and STDOUT again on a single stream, as that seemed less error prone. Tested that both are still streamed.
1 parent 5f980fd commit 6a59ca1

File tree

2 files changed

+82
-83
lines changed

2 files changed

+82
-83
lines changed

runner/app/live/api/api.py

Lines changed: 36 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,25 @@
11
import asyncio
2-
import hashlib
32
import logging
4-
import mimetypes
3+
import json
54
import os
65
import tempfile
76
import time
87
from typing import Optional, cast
98

10-
from aiohttp import BodyPartReader, web
9+
from aiohttp import web
1110
from pydantic import BaseModel, Field
1211
from typing import Annotated, Dict
1312

1413
from streamer import PipelineStreamer, ProcessGuardian
1514
from streamer.protocol.trickle import TrickleProtocol
1615
from streamer.process import config_logging
1716

18-
TEMP_SUBDIR = "infer_temp"
1917
MAX_FILE_AGE = 86400 # 1 day
2018
STREAMER_INPUT_TIMEOUT = 60 # 60s
2119

20+
# File to store the last params that a stream was started with. Used to cleanup
21+
# left over resources (e.g. trickle channels) left by a crashed process.
22+
last_params_file = os.path.join(tempfile.gettempdir(), "ai_runner_last_params.json")
2223

2324
class StartStreamParams(BaseModel):
2425
subscribe_url: Annotated[
@@ -62,44 +63,32 @@ class StartStreamParams(BaseModel):
6263
Field(default="", description="Unique identifier for the stream."),
6364
]
6465

66+
async def cleanup_last_stream():
67+
if not os.path.exists(last_params_file):
68+
logging.debug("No last stream params found to cleanup")
69+
return
6570

66-
def cleanup_old_files(temp_dir):
67-
current_time = time.time()
68-
for filename in os.listdir(temp_dir):
69-
file_path = os.path.join(temp_dir, filename)
70-
if os.path.isfile(file_path):
71-
file_age = current_time - os.path.getmtime(file_path)
72-
if file_age > MAX_FILE_AGE:
73-
os.remove(file_path)
74-
logging.info(f"Removed old file: {file_path}")
71+
try:
72+
with open(last_params_file, "r") as f:
73+
params = StartStreamParams(**json.load(f))
74+
os.remove(last_params_file)
7575

76+
logging.info(f"Cleaning up last stream trickle channels for request_id={params.request_id} subscribe_url={params.subscribe_url} publish_url={params.publish_url} control_url={params.control_url} events_url={params.events_url}")
77+
protocol = TrickleProtocol(
78+
params.subscribe_url,
79+
params.publish_url,
80+
params.control_url,
81+
params.events_url,
82+
)
83+
# Start and stop the protocol to immediately to make sure trickle channels are closed.
84+
await protocol.start()
85+
await protocol.stop()
86+
except:
87+
logging.exception(f"Error cleaning up last stream trickle channels")
7688

77-
async def parse_request_data(request: web.Request, temp_dir: str) -> Dict:
89+
async def parse_request_data(request: web.Request) -> Dict:
7890
if request.content_type.startswith("application/json"):
7991
return await request.json()
80-
elif request.content_type.startswith("multipart/"):
81-
params_data = {}
82-
reader = await request.multipart()
83-
async for part in reader:
84-
if not isinstance(part, BodyPartReader):
85-
continue
86-
elif part.name == "params":
87-
part_data = await part.json()
88-
if part_data:
89-
params_data.update(part_data)
90-
else:
91-
content = await part.read()
92-
file_hash = hashlib.md5(content).hexdigest()
93-
content_type = part.headers.get(
94-
"Content-Type", "application/octet-stream"
95-
)
96-
ext = mimetypes.guess_extension(content_type) or ""
97-
new_filename = f"{file_hash}{ext}"
98-
file_path = os.path.join(temp_dir, new_filename)
99-
with open(file_path, "wb") as f:
100-
f.write(content)
101-
params_data[part.name] = file_path
102-
return params_data
10392
else:
10493
raise ValueError(f"Unknown content type: {request.content_type}")
10594

@@ -118,13 +107,15 @@ async def handle_start_stream(request: web.Request):
118107
logging.error(f"Timeout stopping streamer: {e}")
119108
raise web.HTTPBadRequest(text="Timeout stopping previous streamer")
120109

121-
temp_dir = os.path.join(tempfile.gettempdir(), TEMP_SUBDIR)
122-
os.makedirs(temp_dir, exist_ok=True)
123-
cleanup_old_files(temp_dir)
124-
125-
params_data = await parse_request_data(request, temp_dir)
110+
params_data = await parse_request_data(request)
126111
params = StartStreamParams(**params_data)
127112

113+
try:
114+
with open(last_params_file, "w") as f:
115+
json.dump(params.model_dump(), f)
116+
except Exception as e:
117+
logging.error(f"Error saving last params to file: {e}")
118+
128119
config_logging(request_id=params.request_id, stream_id=params.stream_id)
129120

130121
protocol = TrickleProtocol(
@@ -156,11 +147,7 @@ async def handle_start_stream(request: web.Request):
156147

157148
async def handle_params_update(request: web.Request):
158149
try:
159-
temp_dir = os.path.join(tempfile.gettempdir(), TEMP_SUBDIR)
160-
os.makedirs(temp_dir, exist_ok=True)
161-
cleanup_old_files(temp_dir)
162-
163-
params = await parse_request_data(request, temp_dir)
150+
params = await parse_request_data(request)
164151

165152
process = cast(ProcessGuardian, request.app["process"])
166153
await process.update_params(params)
@@ -180,6 +167,8 @@ async def handle_get_status(request: web.Request):
180167
async def start_http_server(
181168
port: int, process: ProcessGuardian, streamer: Optional[PipelineStreamer] = None
182169
):
170+
asyncio.create_task(cleanup_last_stream())
171+
183172
app = web.Application()
184173
app["process"] = process
185174
app["streamer"] = streamer

runner/app/pipelines/live_video_to_video.py

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,8 @@ def __init__(self, model_id: str):
2525
self.infer_script_path = (
2626
Path(__file__).parent.parent / "live" / "infer.py"
2727
)
28-
try:
29-
logging.info("Starting pipeline process")
30-
self.start_process(
31-
pipeline=self.model_id, # we use the model_id as the pipeline name for now
32-
http_port=8888,
33-
# TODO: set torch device from self.torch_device
34-
)
35-
except Exception as e:
36-
raise InferenceError(original_exception=e)
37-
28+
self.restart_count = 0
29+
self.start_process()
3830

3931
def __call__( # type: ignore
4032
self, *, subscribe_url: str, publish_url: str, control_url: str, events_url: str, params: dict, request_id: str, stream_id: str, **kwargs
@@ -106,37 +98,34 @@ class PipelineStatus(BaseModel):
10698
threading.Thread(target=lambda: self.log_process_diagnostics(full=True)).start()
10799
raise ConnectionError(f"Failed to get status: {e}")
108100

109-
def start_process(self, **kwargs):
101+
def start_process(self):
102+
logging.info("Starting pipeline process")
110103
cmd = [sys.executable, str(self.infer_script_path)]
111-
112-
# Add any additional kwargs as command-line arguments
113-
for key, value in kwargs.items():
114-
kebab_key = key.replace("_", "-")
115-
if isinstance(value, str):
116-
escaped_value = str(value).replace("'", "'\\''")
117-
cmd.extend([f"--{kebab_key}", f"{escaped_value}"])
118-
else:
119-
cmd.extend([f"--{kebab_key}", f"{value}"])
104+
cmd.extend(["--pipeline", self.model_id]) # we use the model_id as the pipeline name for now
105+
cmd.extend(["--http-port", "8888"])
106+
# TODO: set torch device from self.torch_device
120107

121108
env = os.environ.copy()
122109
env["HUGGINGFACE_HUB_CACHE"] = str(self.model_dir)
123110

124111
try:
125112
self.process = subprocess.Popen(
126-
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env
113+
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, env=env
127114
)
128115

129116
self.monitor_thread = threading.Thread(target=self.monitor_process)
130117
self.monitor_thread.start()
131-
self.stdout_log_thread = threading.Thread(target=log_output, args=(self.process.stdout,))
132-
self.stdout_log_thread.start()
133-
self.stderr_log_thread = threading.Thread(target=log_output, args=(self.process.stderr,))
134-
self.stderr_log_thread.start()
118+
self.log_thread = threading.Thread(target=log_output, daemon=True, args=(self.process.stdout,))
119+
self.log_thread.start()
135120

136121
except subprocess.CalledProcessError as e:
137122
raise InferenceError(f"Error starting infer.py: {e}")
138123

139124
def monitor_process(self):
125+
# Wait 1 sec before starting to monitor the process. This gives it some
126+
# time to start and also ensures we won't restart the process too often.
127+
time.sleep(1)
128+
140129
while True:
141130
if not self.process:
142131
logging.error("No process to monitor")
@@ -157,11 +146,33 @@ def monitor_process(self):
157146

158147
logging.info(f"infer.py process exited with return_code={return_code}")
159148
self.log_process_diagnostics(full=True)
160-
self.stop_process(is_monitor_thread=True)
161-
return
149+
break
150+
151+
self.restart_count += 1
152+
if self.restart_count > 10:
153+
logging.error("infer.py process has restarted more than 10 times. Exiting.")
154+
os._exit(1)
155+
156+
# Start a separate thread to restart the process since it will
157+
# restart the monitor thread itself (the current thread).
158+
def restart_process():
159+
try:
160+
logging.info(f"Restarting infer.py process restart_count={self.restart_count}")
161+
self.stop_process()
162+
self.start_process()
163+
except Exception as e:
164+
logging.error(f"Error restarting infer.py process: {e}")
165+
os._exit(1)
166+
threading.Thread(target=restart_process).start()
162167

163-
def stop_process(self, is_monitor_thread: bool = False):
168+
def stop_process(self):
164169
if self.process:
170+
if self.process.stdout:
171+
# Closing the output stream sometimes hangs, so we do it in a separate daemon thread
172+
# and join the log_thread below with a timeout. If it does hang there might be a thread
173+
# leak which is why we limit to up to 10 restarts.
174+
stdout = self.process.stdout
175+
threading.Thread(target=lambda: stdout.close(), daemon=True).start()
165176
self.process.terminate()
166177
try:
167178
self.process.wait(timeout=10)
@@ -174,15 +185,14 @@ def stop_process(self, is_monitor_thread: bool = False):
174185
logging.error(f"Error while force killing process: {e}")
175186
os._exit(1)
176187
self.process = None
177-
if self.monitor_thread and not is_monitor_thread:
188+
if self.monitor_thread:
178189
self.monitor_thread.join()
179190
self.monitor_thread = None
180-
if hasattr(self, 'stdout_log_thread') and self.stdout_log_thread:
181-
self.stdout_log_thread.join()
182-
self.stdout_log_thread = None
183-
if hasattr(self, 'stderr_log_thread') and self.stderr_log_thread:
184-
self.stderr_log_thread.join()
185-
self.stderr_log_thread = None
191+
if self.log_thread:
192+
self.log_thread.join(timeout=1)
193+
if self.log_thread.is_alive():
194+
logging.warning("Log thread did not finish")
195+
self.log_thread = None
186196
logging.info("Infer process stopped successfully")
187197

188198

@@ -252,7 +262,7 @@ def read_proc_as_map(path: str) -> dict | str:
252262
with open(path, "r") as f:
253263
return f.read()
254264

255-
os_proc_info = {}
265+
os_proc_info: dict[str, str | dict] = {}
256266
for proc_file in ["status", "wchan", "io"]:
257267
try:
258268
path = f"/proc/{pid}/{proc_file}"

0 commit comments

Comments
 (0)