
Commit 3a2c995

Add load balancer (google#106)
Adds a buffered load balancer, which by default maintains at least 2 tasks assigned to each worker. Closes google#91
1 parent a08ad91 commit 3a2c995

6 files changed: +205 additions, -20 deletions
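For orientation before the diffs, here is a minimal sketch of how the new scheduler is meant to be driven, modeled on the unit test added in this commit. The helper names (make_worker, job) and the use of concurrent.futures.Future as the worker future are illustrative assumptions, not part of the change:

import concurrent.futures
import threading

from compiler_opt.distributed import worker
from compiler_opt.distributed.local import buffered_scheduler

call_count = [0] * 4
lock = threading.Lock()

def make_worker(i):
  # A "worker" here is just a callable that records that it ran.
  def wkr():
    with lock:
      call_count[i] += 1
  return wkr

workers = [make_worker(i) for i in range(4)]

def job(wkr):
  # A work item takes a worker and returns a future-like object.
  future = concurrent.futures.Future()

  def run():
    wkr()
    future.set_result(0)

  threading.Timer(interval=0.1, function=run).start()
  return future

# 20 jobs over 4 workers; with buffer=2 the scheduler keeps roughly two tasks
# assigned to each worker while work remains, chaining a new one as each completes.
futures = buffered_scheduler.schedule([job] * 20, workers, buffer=2)
worker.wait_for(futures)
assert sum(call_count) == 20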
compiler_opt/distributed/local/buffered_scheduler.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An optimal push-pull-based load balancer which attempts to maintain at least
`buffer` tasks assigned to each worker.
"""

import concurrent.futures
import threading

from typing import List, Callable, TypeVar

from compiler_opt.distributed import worker

T = TypeVar('T')


def schedule(work: List[Callable[[T], worker.WorkerFuture]],
             workers: List[T],
             buffer=2) -> List[worker.WorkerFuture]:
  """Assigns new work to a worker once that worker's previous work is completed.

  Args:
    work: list of functions, each called with a single worker as its argument.
    workers: list of workers, each passed as the singular argument to a work
      callable.
    buffer: number of tasks to maintain on each worker.

  Returns:
    A list of Futures, one per element of `work`.
  """
  # Create the futures to be returned first; they aren't bound to anything
  # now, but they will be later.
  results = [concurrent.futures.Future() for _ in range(len(work))]
  idx = -1
  idx_lock = threading.Lock()

  # Simple atomic increment and get.
  # Used to iterate over `work` like a thread-safe queue without making a copy.
  def fetch_idx():
    nonlocal idx
    with idx_lock:
      idx += 1
      return idx

  def make_result_handler(wkr: T, result_future: concurrent.futures.Future):

    def handler(worker_future: concurrent.futures.Future):
      if (e := worker_future.exception()) is not None:
        result_future.set_exception(e)
      else:
        result_future.set_result(worker_future.result())
      chain_work(wkr)

    return handler

  def chain_work(wkr: T):
    if (i := fetch_idx()) < len(work):
      # This potentially causes a deadlock if chain_work is called via a
      # future.set_result() context which holds a resource that is also
      # required to complete the work[i](wkr) call below. For an example, see:
      # https://gist.github.com/Northbadge/a57f2d4e0a71e8f3934bdb47e59e343e
      # A fix/workaround would be using threading below, but that introduces
      # the overhead of creating a new thread.
      work[i](wkr).add_done_callback(make_result_handler(wkr, results[i]))

  # Use min() in case buffer is huge for some reason.
  for _ in range(min(buffer, (len(work) // len(workers)) + 1)):
    for w in workers:
      chain_work(w)

  return results

compiler_opt/distributed/local/buffered_scheduler_test.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test for buffered_scheduler."""

import concurrent.futures
import threading
import time

from absl.testing import absltest
from compiler_opt.distributed import worker
from compiler_opt.distributed.local import buffered_scheduler


class BufferedSchedulerTest(absltest.TestCase):

  def test_schedules(self):
    call_count = [0] * 4
    locks = [threading.Lock() for _ in range(4)]

    def wkr_factory(i):

      def wkr():
        with locks[i]:
          call_count[i] += 1

      return wkr

    wkrs = [wkr_factory(i) for i in range(4)]

    def job(wkr):
      future = concurrent.futures.Future()

      def task():
        wkr()
        future.set_result(0)

      threading.Timer(interval=0.10, function=task).start()
      return future

    work = [job] * 20

    worker.wait_for(buffered_scheduler.schedule(work, wkrs))
    self.assertEqual(sum(call_count), 20)

  def test_balances(self):
    call_count = [0] * 4
    locks = [threading.Lock() for _ in range(4)]

    def wkr_factory(i):

      def wkr():
        with locks[i]:
          call_count[i] += 1

      return wkr

    def slow_wkr():
      with locks[0]:
        call_count[0] += 1
        time.sleep(1)

    wkrs = [slow_wkr] + [wkr_factory(i) for i in range(1, 4)]

    def job(wkr):
      future = concurrent.futures.Future()

      def task():
        wkr()
        future.set_result(0)

      threading.Timer(interval=0.10, function=task).start()
      return future

    work = [job] * 20

    worker.wait_for(buffered_scheduler.schedule(work, wkrs, buffer=2))
    self.assertEqual(sum(call_count), 20)
    # Since buffer=2, 2 tasks get assigned to the slow wkr; the rest
    # should've been assigned elsewhere if load balancing works.
    self.assertEqual(call_count[0], 2)


if __name__ == '__main__':
  absltest.main()

compiler_opt/distributed/local/local_worker_manager.py

Lines changed: 5 additions & 0 deletions
@@ -161,6 +161,11 @@ def _msg_pump(self):
       with self._lock:
         future = self._map[task_result.msgid]
         del self._map[task_result.msgid]
+      # The following will trigger any callbacks defined on the future, as a
+      # direct function call. If those callbacks were set by the scheduler,
+      # it's important that self._lock isn't being held when they are being
+      # called, otherwise a deadlock could arise from __getattr__ trying to
+      # acquire the lock.
       if task_result.success:
         future.set_result(task_result.value)
       else:

compiler_opt/distributed/worker.py

Lines changed: 5 additions & 5 deletions
@@ -14,8 +14,7 @@
 # limitations under the License.
 """Common abstraction for a worker contract."""

-import abc
-from typing import Generic, Iterable, Optional, TypeVar
+from typing import Iterable, Optional, TypeVar, Protocol


 class Worker:
@@ -30,16 +29,17 @@ def is_priority_method(cls, method_name: str) -> bool:


 # Dask's Futures are limited. This captures that.
-class WorkerFuture(Generic[T], metaclass=abc.ABCMeta):
+class WorkerFuture(Protocol[T]):

-  @abc.abstractmethod
   def result(self) -> T:
     raise NotImplementedError()

-  @abc.abstractmethod
   def done(self) -> bool:
     raise NotImplementedError()

+  def add_done_callback(self, fn) -> None:
+    raise NotImplementedError
+

 def wait_for(futures: Iterable[WorkerFuture]):
   """Dask futures don't support more than result() and done()."""

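A side effect of moving WorkerFuture from an abc-based base class to typing.Protocol is that conformance is now structural: any object exposing result(), done() and add_done_callback() type-checks as a WorkerFuture without inheriting from it. A small sketch of that; the resolved() helper below is hypothetical and not part of this commit:

import concurrent.futures

from compiler_opt.distributed import worker

def resolved(value) -> worker.WorkerFuture:
  # concurrent.futures.Future never subclasses WorkerFuture, but it provides
  # result(), done() and add_done_callback(), so it satisfies the Protocol
  # structurally - which is what buffered_scheduler relies on.
  f = concurrent.futures.Future()
  f.set_result(value)
  return f

worker.wait_for([resolved(1), resolved(2)])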
compiler_opt/rl/local_data_collector.py

Lines changed: 16 additions & 14 deletions
@@ -23,6 +23,7 @@
 from tf_agents.trajectories import trajectory

 from compiler_opt.distributed import worker
+from compiler_opt.distributed.local import buffered_scheduler
 from compiler_opt.rl import compilation_runner
 from compiler_opt.rl import corpus
 from compiler_opt.rl import data_collector
@@ -55,7 +56,7 @@ def __init__(
     # with the training phase - i.e. whatever happens between successive data
     # collection calls. Subsequent runs will wait for these to finish.
     self._reset_workers: Optional[concurrent.futures.Future] = None
-    self._current_work: List[Tuple[corpus.ModuleSpec, worker.WorkerFuture]] = []
+    self._current_futures: List[worker.WorkerFuture] = []
     self._pool = concurrent.futures.ThreadPoolExecutor()

   def close_pool(self):
@@ -85,12 +86,15 @@ def _schedule_jobs(
     jobs = [(module_spec, policy_path, self._reward_stat_map[module_spec.name])
             for module_spec in sampled_modules]

-    # TODO: Issue #91. Naive load balancing.
-    ret = []
-    for i in range(len(jobs)):
-      ret.append(self._worker_pool[i % len(self._worker_pool)].collect_data(
-          *(jobs[i])))
-    return ret
+    def work_factory(job):
+
+      def work(w):
+        return w.collect_data(*job)
+
+      return work
+
+    work = [work_factory(job) for job in jobs]
+    return buffered_scheduler.schedule(work, self._worker_pool, buffer=10)

   def collect_data(
       self, policy_path: str
@@ -108,22 +112,20 @@ def collect_data(
       information is viewable in TensorBoard.
     """
     sampled_modules = self._corpus.sample(k=self._num_modules, sort=False)
-    results = self._schedule_jobs(policy_path, sampled_modules)
+    self._current_futures = self._schedule_jobs(policy_path, sampled_modules)

     def wait_for_termination():
       early_exit = self._exit_checker_ctor(num_modules=self._num_modules)

       def get_num_finished_work():
-        finished_work = sum(res.done() for res in results)
+        finished_work = sum(res.done() for res in self._current_futures)
         return finished_work

       return early_exit.wait(get_num_finished_work)

     wait_seconds = wait_for_termination()
-    self._current_work = list(zip(sampled_modules, results))
-    finished_work = [
-        (spec, res) for spec, res in self._current_work if res.done()
-    ]
+    current_work = list(zip(sampled_modules, self._current_futures))
+    finished_work = [(spec, res) for spec, res in current_work if res.done()]
     successful_work = [(spec, res.result())
                        for spec, res in finished_work
                        if not worker.get_exception(res)]
@@ -139,7 +141,7 @@ def wrapup():
       # now that the workers killed pending compilations, make sure the workers
       # drained their working queues first - they should all complete quickly
       # since the cancellation manager is killing immediately any process starts
-      worker.wait_for(results)
+      worker.wait_for(self._current_futures)
       worker.wait_for([wkr.enable() for wkr in self._worker_pool])

     self._reset_workers = self._pool.submit(wrapup)

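A brief note on the work_factory pattern in the hunk above: each job is bound through a factory call rather than a lambda defined directly in the loop, presumably to avoid Python's late-binding closures, where every loop-defined lambda would end up seeing only the last job. A standalone illustration of that pitfall (not part of the commit):

jobs = ['a', 'b', 'c']

late = [lambda: job for job in jobs]
print([f() for f in late])      # ['c', 'c', 'c'] - every closure sees the last job

def factory(job):
  return lambda: job            # binds the job passed in at this call

bound = [factory(job) for job in jobs]
print([f() for f in bound])     # ['a', 'b', 'c']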
compiler_opt/rl/local_data_collector_test.py

Lines changed: 1 addition & 1 deletion
@@ -188,7 +188,7 @@ def wait(self, _):
     collector.collect_data(policy_path='policy')
     collector._join_pending_jobs()
     killed = 0
-    for _, w in collector._current_work:
+    for w in collector._current_futures:
       self.assertRaises(compilation_runner.ProcessKilledError, w.result)
       killed += 1
     self.assertEqual(killed, 4)
