[infra] Introduce a simple local-process worker pool manager (#33)

mtrofin · web-flow · commit 7630deaaa389 · 2022-07-11T08:13:21.000-07:00
[infra] Introduce a simple local-process worker pool manager

This is meant as an in-place replacement of the current functionality,
but using a stateful worker object abstraction analogous to what Dask
supports. For local workloads, this implementation is faster than Dask's
LocalCluster (which may be due to misconfiguration). Because it's very
simple and introduces no new dependencies, even if LocalCluster
performance could be improved, the implementation would still be useful
for debugging.

The worker abstraction allows implementers to specify a list of methods
that should be executed promptly on the server side, analogous to Dask's
`separate_thread=False` concept. For our purposes, we can use it to
implement full cancelation of work on the server side.

This patch introduces the worker manager; a subsequent patch will enable
its use in the rest of the codebase.
diff --git a/compiler_opt/distributed/__init__.py b/compiler_opt/distributed/__init__.py
@@ -0,0 +1,14 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/compiler_opt/distributed/local/__init__.py b/compiler_opt/distributed/local/__init__.py
@@ -0,0 +1,14 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/compiler_opt/distributed/local/local_worker_manager.py b/compiler_opt/distributed/local/local_worker_manager.py
@@ -0,0 +1,230 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Local Process Pool - based middleware implementation.
+
+This is a simple implementation of a worker pool, running on the local machine.
+Each worker object is hosted by a separate process. Each worker object may
+handle a number of concurrent requests. The client is given a stub object that
+exposes the same methods as the worker, just that they return Futures.
+
+There is a pair of queues between a stub and its corresponding process/worker.
+One queue is used to place tasks (method calls), the other to receive results.
+Tasks and results are correlated by a monotonically incrementing counter
+maintained by the stub.
+
+The worker process dequeues tasks promptly and either re-enqueues them to a
+local thread pool, or, if the task is 'urgent', it executes it promptly.
+"""
+import concurrent.futures
+import dataclasses
+import functools
+import multiprocessing
+import multiprocessing.connection
+import queue  # pylint: disable=unused-import
+import threading
+
+from absl import logging
+# pylint: disable=unused-import
+from compiler_opt.distributed.worker import Worker
+
+from contextlib import AbstractContextManager
+from typing import Any, Callable, Dict, Optional
+
+
+@dataclasses.dataclass(frozen=True)
+class Task:
+  """A method call request, sent from a stub to its worker process.
+
+  Instances are immutable (the dataclass is frozen).
+  """
+  # Correlation id; the TaskResult answering this task carries the same value.
+  msgid: int
+  # Name of the method to call; the worker process resolves it with getattr
+  # on the worker object.
+  func_name: str
+  # Positional arguments for the call.
+  args: tuple
+  # Keyword arguments for the call.
+  kwargs: dict
+  # When True, the worker process runs the call inline in its dispatch loop
+  # rather than submitting it to its thread pool.
+  is_urgent: bool
+
+
+@dataclasses.dataclass(frozen=True)
+class TaskResult:
+  """The outcome of one Task, sent from the worker process back to the stub.
+
+  Instances are immutable (the dataclass is frozen).
+  """
+  # The msgid of the Task this result answers.
+  msgid: int
+  # True if the call returned normally, False if it raised.
+  success: bool
+  # The call's return value on success; the raised exception object otherwise.
+  value: Any
+
+
+def _run_impl(in_q: 'queue.Queue[Task]', out_q: 'queue.Queue[TaskResult]',
+              worker_class: 'type[Worker]', *args, **kwargs):
+  """Worker process entrypoint."""
+  # Note: the out_q is typed as taking only TaskResult objects, not
+  # Optional[TaskResult], despite that being the type it is used on the Stub
+  # side. This is because the `None` value is only injected by the Stub itself.
+  pool = concurrent.futures.ThreadPoolExecutor()
+  obj = worker_class(*args, **kwargs)
+
+  def make_ondone(msgid):
+
+    def on_done(f: concurrent.futures.Future):
+      if f.exception():
+        out_q.put(TaskResult(msgid=msgid, success=False, value=f.exception()))
+      else:
+        out_q.put(TaskResult(msgid=msgid, success=True, value=f.result()))
+
+    return on_done
+
+  # Run forever. The stub will just kill the runner when done.
+  while True:
+    task = in_q.get()
+    the_func = getattr(obj, task.func_name)
+    application = functools.partial(the_func, *task.args, **task.kwargs)
+    if task.is_urgent:
+      try:
+        res = application()
+        out_q.put(TaskResult(msgid=task.msgid, success=True, value=res))
+      except BaseException as e:  # pylint: disable=broad-except
+        out_q.put(TaskResult(msgid=task.msgid, success=False, value=e))
+    else:
+      pool.submit(application).add_done_callback(make_ondone(task.msgid))
+
+
+def _run(*args, **kwargs):
+  try:
+    _run_impl(*args, **kwargs)
+  except BaseException as e:
+    logging.error(e)
+    raise e
+
+
+def _make_stub(cls: 'type[Worker]', *args, **kwargs):
+
+  class _Stub():
+    """Client stub to a worker hosted by a process."""
+
+    def __init__(self):
+      self._send: 'queue.Queue[Task]' = multiprocessing.get_context().Queue()
+      self._receive: 'queue.Queue[Optional[TaskResult]]' = \
+        multiprocessing.get_context().Queue()
+
+      # this is the process hosting one worker instance.
+      self._process = multiprocessing.Process(
+          target=functools.partial(
+              _run,
+              worker_class=cls,
+              in_q=self._send,
+              out_q=self._receive,
+              *args,
+              **kwargs))
+      # lock for the msgid -> reply future map. The map will be set to None
+      # when we stop.
+      self._lock = threading.Lock()
+      self._map: Dict[int, concurrent.futures.Future] = {}
+
+      # thread drainig the receive queue
+      self._pump = threading.Thread(target=self._msg_pump)
+      def observer():
+        self._process.join()
+        self._receive.put(None)
+      self._observer = threading.Thread(target=observer)
+
+      # atomic control to _msgid
+      self._msgidlock = threading.Lock()
+      self._msgid = 0
+
+      # start the worker and the message pump
+      self._process.start()
+      # the observer must follow the process start, otherwise join() raises.
+      self._observer.start()
+      self._pump.start()
+
+    def _msg_pump(self):
+      while True:
+        task_result = self._receive.get()
+        if task_result is None:
+          break
+        with self._lock:
+          future = self._map[task_result.msgid]
+          del self._map[task_result.msgid]
+          if task_result.success:
+            future.set_result(task_result.value)
+          else:
+            future.set_exception(task_result.value)
+
+      # clear out pending futures and mark ourselves as "stopped" by null-ing
+      # the map
+      with self._lock:
+        for _, v in self._map.items():
+          v.set_exception(concurrent.futures.CancelledError())
+        self._map = None
+
+    def _is_stopped(self):
+      return self._map is None
+
+    def __getattr__(self, name) -> Callable[[Any], concurrent.futures.Future]:
+      result_future = concurrent.futures.Future()
+
+      with self._msgidlock:
+        msgid = self._msgid
+        self._msgid += 1
+
+      def remote_call(*args, **kwargs):
+        with self._lock:
+          if self._is_stopped():
+            result_future.set_exception(concurrent.futures.CancelledError())
+          else:
+            self._send.put(
+                Task(
+                    msgid=msgid,
+                    func_name=name,
+                    args=args,
+                    kwargs=kwargs,
+                    is_urgent=cls.is_priority_method(name)))
+            self._map[msgid] = result_future
+        return result_future
+
+      return remote_call
+
+    def shutdown(self):
+      try:
+        self._process.kill()
+      except:  # pylint: disable=bare-except
+        pass
+
+    def join(self):
+      self._observer.join()
+      self._pump.join()
+      self._process.join()
+
+    def __dir__(self):
+      return [n for n in dir(cls) if not n.startswith('_')]
+
+  return _Stub()
+
+
+class LocalWorkerPool(AbstractContextManager):
+  """A pool of workers hosted on the local machines, each in its own process."""
+
+  def __init__(self, worker_class: 'type[Worker]', count: Optional[int], *args,
+               **kwargs):
+    if not count:
+      count = multiprocessing.cpu_count()
+    self._stubs = [
+        _make_stub(worker_class, *args, **kwargs) for _ in range(count)
+    ]
+
+  def __enter__(self):
+    return self._stubs
+
+  def __exit__(self, *args, **kwargs):
+    # first, trigger killing the worker process and exiting of the msg pump,
+    # which will also clear out any pending futures.
+    for s in self._stubs:
+      s.shutdown()
+    # now wait for the message pumps to indicate they exit.
+    for s in self._stubs:
+      s.join()
diff --git a/compiler_opt/distributed/local/local_worker_manager_test.py b/compiler_opt/distributed/local/local_worker_manager_test.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test for local worker manager."""
+
+import concurrent.futures
+import multiprocessing
+import time
+
+from absl.testing import absltest
+from compiler_opt.distributed.worker import Worker
+from compiler_opt.distributed.local import local_worker_manager
+from tf_agents.system import system_multiprocessing as multiprocessing
+
+
+class LocalWorkerManagerTest(absltest.TestCase):
+  """Tests for LocalWorkerPool and the stubs it hands out."""
+
+  def test_pool(self):
+    """Calls round-trip and each worker keeps its own state."""
+
+    class Job(Worker):
+      """Test worker."""
+
+      def __init__(self):
+        self._token = 0
+
+      @classmethod
+      def is_priority_method(cls, method_name: str) -> bool:
+        return method_name == 'priority_method'
+
+      def priority_method(self):
+        return f'priority {self._token}'
+
+      def get_token(self):
+        return self._token
+
+      def set_token(self, value):
+        self._token = value
+
+    with local_worker_manager.LocalWorkerPool(Job, 2) as pool:
+      p1 = pool[0]
+      p2 = pool[1]
+      # give each worker a distinct token, and wait for both calls to finish.
+      set_futures = [p1.set_token(1), p2.set_token(2)]
+      done, not_done = concurrent.futures.wait(set_futures)
+      self.assertLen(done, 2)
+      self.assertEmpty(not_done)
+      self.assertLen([f for f in done if not f.exception()], 2)
+      # each worker reports the token it was given, not the other's.
+      self.assertEqual(p1.get_token().result(), 1)
+      self.assertEqual(p2.get_token().result(), 2)
+      self.assertEqual(p1.priority_method().result(), 'priority 1')
+      self.assertEqual(p2.priority_method().result(), 'priority 2')
+      # wait - to make sure the pump doesn't panic if there's no new messages
+      time.sleep(3)
+      # everything still works
+      self.assertEqual(p2.get_token().result(), 2)
+
+  def test_failure(self):
+    """A call to a worker whose constructor failed raises CancelledError."""
+
+    class Job(Worker):
+      """Test worker with a required constructor argument."""
+
+      def __init__(self, wont_be_passed):
+        self._arg = wont_be_passed
+
+      def method(self):
+        return self._arg
+
+    with local_worker_manager.LocalWorkerPool(Job, 2) as pool:
+      with self.assertRaises(concurrent.futures.CancelledError):
+        # this will fail because we didn't pass the arg to the ctor, so the
+        # worker hosting process will crash.
+        pool[0].method().result()
+
+
+  def test_worker_crash_while_waiting(self):
+    """A pending call raises CancelledError once its worker process is killed."""
+
+    class Job(Worker):
+      """Test worker whose method does not return within the test."""
+
+      def method(self):
+        time.sleep(3600)
+
+    with local_worker_manager.LocalWorkerPool(Job, 2) as pool:
+      p = pool[0]
+      f = p.method()
+      self.assertFalse(f.done())
+      try:
+        # kill the process directly, rather than through the stub's shutdown().
+        p._process.kill()  # pylint: disable=protected-access
+      finally:
+        with self.assertRaises(concurrent.futures.CancelledError):
+          _ = f.result()
+
+
+if __name__ == '__main__':
+  # `multiprocessing` here is tf_agents' system_multiprocessing: that import
+  # rebinds the name after the standard library module of the same name is
+  # imported at the top of this file.
+  multiprocessing.handle_test_main(absltest.main)
diff --git a/compiler_opt/distributed/worker.py b/compiler_opt/distributed/worker.py
@@ -0,0 +1,23 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Common abstraction for a worker contract."""
+
+
+class Worker:
+  """Base class for worker objects."""
+
+  @classmethod
+  def is_priority_method(cls, method_name: str) -> bool:
+    """Tells whether `method_name` names a priority method.
+
+    This base implementation answers False for every name; subclasses may
+    override it.
+
+    Args:
+      method_name: the name of the method being asked about.
+
+    Returns:
+      False, always.
+    """
+    # the argument is deliberately unused here.
+    _ = method_name
+    return False