12 changes: 12 additions & 0 deletions Doc/library/multiprocessing.rst
@@ -2108,6 +2108,12 @@ with the :class:`Pool` class.
.. versionadded:: 3.4
*context*

.. versionchanged:: 3.8
When one of the worker processes terminates abruptly (for example, if the
Linux out-of-memory killer kicked in), a :exc:`BrokenProcessPool`
error is now raised. Previously, the behavior was undefined and
the :class:`Pool` or its workers would often freeze or deadlock.

.. note::

Worker processes within a :class:`Pool` typically live for the complete
@@ -2225,6 +2231,12 @@ with the :class:`Pool` class.
:ref:`typecontextmanager`. :meth:`~contextmanager.__enter__` returns the
pool object, and :meth:`~contextmanager.__exit__` calls :meth:`terminate`.

.. exception:: BrokenProcessPool

Derived from :exc:`RuntimeError`, this exception class is raised when
one of the workers of a :class:`Pool` has terminated in a non-clean
fashion (for example, if it was killed from the outside).
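
For illustration only (not part of this patch): a minimal sketch of how user code might observe the new exception. It assumes a POSIX platform (``os.kill`` with ``SIGTERM``) and peeks at the private ``_pool`` attribute to pick a worker, as the tests added in this PR do::

    import os
    import signal
    import time
    import multiprocessing
    from multiprocessing.pool import BrokenProcessPool

    def slow_square(x):
        time.sleep(0.5)          # keep the workers busy so results are still pending
        return x * x

    if __name__ == '__main__':
        pool = multiprocessing.Pool(2)
        result = pool.map_async(slow_square, range(10))
        # Simulate an abrupt worker death while tasks are running.
        os.kill(pool._pool[0].pid, signal.SIGTERM)
        try:
            result.get()
        except BrokenProcessPool:
            print('a worker died unexpectedly; create a new Pool to continue')
        finally:
            pool.close()
            pool.join()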


.. class:: AsyncResult

84 changes: 68 additions & 16 deletions Lib/multiprocessing/pool.py
@@ -33,19 +33,29 @@
RUN = 0
CLOSE = 1
TERMINATE = 2
BROKEN = 3

#
# Miscellaneous
#

job_counter = itertools.count()


def mapstar(args):
return list(map(*args))


def starmapstar(args):
return list(itertools.starmap(args[0], args[1]))


class BrokenProcessPool(RuntimeError):
"""
Raised when a process in a ProcessPoolExecutor terminated abruptly
Contributor: Maybe avoid using ProcessPoolExecutor and future terms, which are objects of the concurrent.futures package and not the multiprocessing package.

while a future was in the running state.
"""

#
# Hack to embed stringification of remote traceback in local traceback
#
@@ -104,6 +114,7 @@ def worker(inqueue, outqueue, initializer=None, initargs=(), maxtasks=None,
if initializer is not None:
initializer(*initargs)

util.debug('worker started')
completed = 0
while maxtasks is None or (maxtasks and completed < maxtasks):
try:
@@ -167,7 +178,7 @@ def __init__(self, processes=None, initializer=None, initargs=(),
if processes is None:
processes = os.cpu_count() or 1
if processes < 1:
raise ValueError("Number of processes must be at least 1")
raise ValueError("Number of processes must be 2 or more")

if initializer is not None and not callable(initializer):
raise TypeError('initializer must be a callable')
@@ -220,20 +231,34 @@ def __init__(self, processes=None, initializer=None, initargs=(),
exitpriority=15
)


@staticmethod
def _join_exited_workers(pool):
"""Cleanup after any worker processes which have exited due to reaching
their specified lifetime. Returns True if any workers were cleaned up.
Returns None if the process pool is broken.
"""
cleaned = False
for i in reversed(range(len(pool))):
worker = pool[i]
if worker.exitcode is not None:
broken = []
for i, p in reversed(list(enumerate(pool))):
broken.append(p.exitcode not in (None, 0))
if p.exitcode is not None:
# worker exited
util.debug('cleaning up worker %d' % i)
worker.join()
p.join()
cleaned = True
del pool[i]

if any(broken):
# Stop all workers
util.info('worker handler: process pool is broken, terminating workers...')
for p in pool:
if p.exitcode is None:
p.terminate()
for p in pool:
p.join()
del pool[:]
return None
return cleaned

def _repopulate_pool(self):
@@ -271,11 +296,14 @@ def _maintain_pool(ctx, Process, processes, pool, inqueue, outqueue,
wrap_exception):
"""Clean up any exited workers and start replacements for them.
"""
if Pool._join_exited_workers(pool):
thread = threading.current_thread()
need_repopulate = Pool._join_exited_workers(pool)
if need_repopulate:
Pool._repopulate_pool_static(ctx, Process, processes, pool,
inqueue, outqueue, initializer,
initargs, maxtasksperchild,
wrap_exception)
return need_repopulate

def _setup_queues(self):
self._inqueue = self._ctx.SimpleQueue()
@@ -437,13 +465,24 @@ def _handle_workers(cache, taskqueue, ctx, Process, processes, pool,
inqueue, outqueue, initializer, initargs,
maxtasksperchild, wrap_exception):
thread = threading.current_thread()
util.debug('worker handler entering')

# Keep maintaining workers until the cache gets drained, unless the pool
# is terminated.
while thread._state == RUN or (cache and thread._state != TERMINATE):
Pool._maintain_pool(ctx, Process, processes, pool, inqueue,
outqueue, initializer, initargs,
maxtasksperchild, wrap_exception)
new_workers = Pool._maintain_pool(
ctx, Process, processes, pool, inqueue,
outqueue, initializer, initargs,
maxtasksperchild, wrap_exception)
if new_workers is None:
thread._state = BROKEN
for i, cache_ent in list(cache.items()):
err = BrokenProcessPool(
'A worker of the pool terminated abruptly '
'while the child process was still executing.')
# Exhaust MapResult with errors
while cache_ent._number_left > 0:
cache_ent._set(i, (False, err))
time.sleep(0.1)
# send sentinel to stop workers
taskqueue.put(None)
@@ -452,6 +491,7 @@ def _handle_workers(cache, taskqueue, ctx, Process, processes, pool,
@staticmethod
def _handle_tasks(taskqueue, put, outqueue, pool, cache):
thread = threading.current_thread()
util.debug('task handler entering')

for taskseq, set_length in iter(taskqueue.get, None):
task = None
@@ -497,6 +537,7 @@ def _handle_tasks(taskqueue, put, outqueue, pool, cache):

@staticmethod
def _handle_results(outqueue, get, cache):
util.debug('result handler entering')
thread = threading.current_thread()

while 1:
@@ -573,7 +614,9 @@ def close(self):
util.debug('closing pool')
if self._state == RUN:
self._state = CLOSE
self._worker_handler._state = CLOSE
# Avert race condition in broken pools
if self._worker_handler._state != BROKEN:
self._worker_handler._state = CLOSE

def terminate(self):
util.debug('terminating pool')
@@ -606,13 +649,21 @@ def _help_stuff_finish(inqueue, task_handler, size):
def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,
worker_handler, task_handler, result_handler, cache):
# this is guaranteed to only be called once
util.debug('finalizing pool')
util.debug('terminate pool entering')
is_broken = BROKEN in (task_handler._state,
worker_handler._state,
result_handler._state)

worker_handler._state = TERMINATE
Contributor: No need to use the _worker_state_lock here? And in other places where _worker_handler._state is manipulated?

task_handler._state = TERMINATE

util.debug('helping task handler/workers to finish')
cls._help_stuff_finish(inqueue, task_handler, len(pool))
# Skip _help_finish_stuff if the pool is broken, because
# the broken process may have been holding the inqueue lock.
if not is_broken:
util.debug('helping task handler/workers to finish')
cls._help_stuff_finish(inqueue, task_handler, len(pool))
else:
util.debug('finalizing BROKEN process pool')

if (not result_handler.is_alive()) and (len(cache) != 0):
raise AssertionError(
@@ -623,8 +674,8 @@ def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,

# We must wait for the worker handler to exit before terminating
# workers because we don't want workers to be restarted behind our back.
util.debug('joining worker handler')
if threading.current_thread() is not worker_handler:
util.debug('joining worker handler')
worker_handler.join()

# Terminate workers which haven't already finished.
@@ -634,12 +685,12 @@ def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,
if p.exitcode is None:
p.terminate()

util.debug('joining task handler')
if threading.current_thread() is not task_handler:
util.debug('joining task handler')
task_handler.join()

util.debug('joining result handler')
if threading.current_thread() is not result_handler:
util.debug('joining result handler')
result_handler.join()

if pool and hasattr(pool[0], 'terminate'):
@@ -649,6 +700,7 @@ def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,
# worker has not yet exited
util.debug('cleaning up worker %d' % p.pid)
p.join()
util.debug('terminate pool finalized')

def __enter__(self):
return self
44 changes: 44 additions & 0 deletions Lib/test/_test_multiprocessing.py
@@ -2571,6 +2571,15 @@ def raising():
def unpickleable_result():
return lambda: 42

def waiting(args):
time.sleep(7)
Member: I suppose it's not gonna wait that long when running the test? Just checking.

Author: Revised tests. I wanted to make sure we did not break the pool before any tasks were running. Now I've reduced that wait to 0.5 sec - https://github.com/python/cpython/pull/10441/files#diff-b046ab474480855fb4a01de88cfc82bbR2672


def bad_exit(value):
if value:
from sys import exit
exit(123)


class _TestPoolWorkerErrors(BaseTestCase):
ALLOWED_TYPES = ('processes', )

@@ -2611,6 +2620,41 @@ def errback(exc):
p.close()
p.join()

def test_broken_process_pool1(self):
from multiprocessing.pool import BrokenProcessPool
p = multiprocessing.Pool(2)
res = p.map_async(waiting, range(10))
# Kill one of the pool workers.
waiting(None)
pid = p._pool[0].pid
os.kill(pid, signal.SIGTERM)
with self.assertRaises(BrokenProcessPool):
res.get()
p.close()
p.join()


def test_broken_process_pool2(self):
from multiprocessing.pool import BrokenProcessPool
p = multiprocessing.Pool(2)
res = p.map_async(waiting, [1])
# Kill one of the pool workers.
pid = p._pool[0].pid
os.kill(pid, signal.SIGTERM)
Member: Same here.

with self.assertRaises(BrokenProcessPool):
res.get()
Member: You've only launched a single task, so what if it was scheduled on the other worker? I don't think this test is reliable.

Author (@oesteban, Dec 18, 2018): At this point (without the patch) you would inevitably get the same behavior: the pool hangs forever. With this patch, if you launch a single task on a broken pool (or the pool will be broken before the result is collected), you'll get the BrokenPoolError, regardless of the worker that was killed. We could keep track of sane results and try to rescue the most, but the original fix didn't look into that and it might be subject for a different PR. Similarly (matter of another PR), we could identify when the pool could be recovered (e.g., the worker died when the pool was idle waiting for tasks).

p.close()
p.join()

def test_broken_process_pool3(self):
from multiprocessing.pool import BrokenProcessPool
p = multiprocessing.Pool(2)
with self.assertRaises(BrokenProcessPool):
res = p.map(bad_exit, [0, 0, 1, 0])
p.close()
p.join()


class _TestPoolWorkerLifetime(BaseTestCase):
ALLOWED_TYPES = ('processes', )

@@ -0,0 +1,2 @@
Fix ``multiprocessing.Pool`` hanging indefinitely when a worker process dies
unexpectedly. Patch by Oscar Esteban, based on code from Dan O'Reilly.
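
For context only (not part of the changeset): a minimal sketch of the failure mode the entry describes, modelled on the ``bad_exit`` helper added to the tests. Before this change a script like the one below would hang forever in ``map``; with the patch it is expected to raise ``BrokenProcessPool``. The function name is illustrative::

    import multiprocessing
    from multiprocessing.pool import BrokenProcessPool

    def exits_abruptly(value):
        if value:
            raise SystemExit(123)    # the worker process dies in the middle of the job
        return value

    if __name__ == '__main__':
        pool = multiprocessing.Pool(2)
        try:
            pool.map(exits_abruptly, [0, 0, 1, 0])
        except BrokenProcessPool:
            print('a worker exited abruptly; the pool is no longer usable')
        finally:
            pool.close()
            pool.join()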