check_env in multiprocess (#3879)

grimoire · web-flow · commit f710f5168dba · 2025-08-27T18:14:39.000+08:00
diff --git a/lmdeploy/pytorch/engine/engine_checker.py b/lmdeploy/pytorch/engine/engine_checker.py
@@ -96,3 +96,27 @@ def check(self):
                               message='num_gpu_blocks should be greater than 16, '
                               f'but got {num_gpu_blocks}. Set num_gpu_blocks to 0 to automatically '
                               'determine the number of GPU blocks based on the model size and device memory.')
+
+    def _handle_impl(self):  # runs the base-class handle(); invoked by handle() in a worker process or inline
+        return super().handle()
+
+    def handle(self):
+        """Run `_handle_impl` in a spawned worker process, or inline when already daemonic."""
+        import multiprocessing as mp
+        import sys
+        from concurrent.futures import ProcessPoolExecutor
+
+        from lmdeploy.pytorch import envs
+        if not envs.enable_check_env:  # checks switched off via LMDEPLOY_ENABLE_CHECK_ENV
+            return
+        # daemonic processes are not allowed to create children, so check in-process.
+        if mp.current_process().daemon:
+            return self._handle_impl()
+        # only one task is ever submitted, so a single worker is all the pool needs.
+        with ProcessPoolExecutor(max_workers=1, mp_context=mp.get_context('spawn')) as executor:
+            try:
+                executor.submit(self._handle_impl).result()
+            except SystemExit:
+                sys.exit(1)  # builtin `exit` is injected by `site` and may be unavailable
+            except BaseException as e:
+                self.log_and_exit(e, mod_name='Engine')
diff --git a/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py b/lmdeploy/pytorch/engine/mp_engine/zmq_engine.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
+import atexit
 import pickle
 import signal
 from typing import TYPE_CHECKING
@@ -40,6 +41,7 @@ def __init__(self, model_path: str, tokenizer: object, engine_config: PytorchEng
         self.rpc_client = AsyncRPCClient(port=self.port)
 
         super().__init__()
+        atexit.register(self.close)
 
     def _start_mp_proc(self, model_path: str, tokenizer: object, engine_config: PytorchEngineConfig = None):
         """Start mp proc."""
@@ -54,16 +56,17 @@ def _start_mp_proc(self, model_path: str, tokenizer: object, engine_config: Pyto
             condition = manager.Condition()
             self.mp_ctx = mp.get_context('spawn')
             log_level = logger.level
-            self.proc = self.mp_ctx.Process(target=self._mp_proc,
-                                            args=(self.shared_dict, condition),
-                                            kwargs=(dict(
-                                                model_path=model_path,
-                                                tokenizer=tokenizer,
-                                                engine_config=engine_config,
-                                                log_level=log_level,
-                                            )),
-                                            name='mp_engine_proc',
-                                            daemon=True)
+            self.proc = self.mp_ctx.Process(
+                target=self._mp_proc,
+                args=(self.shared_dict, condition),
+                kwargs=(dict(
+                    model_path=model_path,
+                    tokenizer=tokenizer,
+                    engine_config=engine_config,
+                    log_level=log_level,
+                )),
+                name='mp_engine_proc',
+            )
             self.proc.start()
             logger.debug('Receiving rpc server port from mp process.')
             with condition:
@@ -156,10 +159,13 @@ async def _collective_rpc_streaming_async(self, func, *args, **kwargs):
 
     def close(self) -> None:
         """Close mp engine."""
+        if self.proc is None:  # no process handle left, e.g. already closed (close() is also registered via atexit)
+            return
         logger.info('Closing mp engine.')
         self.rpc_client.stop()
         self.proc.terminate()
         self.proc.join(10)
+        self.proc = None  # mark as closed so a repeated call returns early
 
     def start_loop(self) -> None:
         """Start mp engine loop."""
diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py
@@ -118,6 +118,9 @@ def _patched_get_env(
     # logging
     log_file = os.getenv('LMDEPLOY_LOG_FILE', None)
 
+    # check env
+    enable_check_env = env_to_bool('LMDEPLOY_ENABLE_CHECK_ENV', True)
+
     # dlblas
     # we don't need to read this, it would be passed to ray workers
     # If Ray is launched from outside, it may fail to access the environment variables.