We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 3de6e6a · commit 1dab9bc — Copy full SHA for 1dab9bc
vllm/executor/multiproc_gpu_executor.py
@@ -37,6 +37,11 @@ def _init_executor(self) -> None:
37
# Disable torch async compiling which won't work with daemonic processes
38
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
39
40
+ # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
41
+ # contention amongst the shards
42
+ if "OMP_NUM_THREADS" not in os.environ:
43
+ os.environ["OMP_NUM_THREADS"] = "1"
44
+
45
assert world_size <= cuda_device_count_stateless(), (
46
"please set tensor_parallel_size to less than max local gpu count")
47
0 commit comments