Skip to content

Commit 019992d

Browse files
authored
Async go explore (#244)
Make rollout workers run asynchronously relative to the cache updates in the main thread, to increase the throughput of rollouts.
1 parent 64c6568 commit 019992d

File tree

2 files changed

+114
-36
lines changed

2 files changed

+114
-36
lines changed

bridger/config/training.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@
109109
help_str = "The number of actions to take in an exploration rollout for go-explore."
110110
hparam_dict[key]["help"] = help_str
111111

112-
key = "go_explore_num_iterations"
112+
key = "go_explore_num_tasks"
113113
hparam_dict[key] = {"type": int, "default": 8}
114-
help_str = "The number of iterations of exploration rollouts for go-explore."
114+
help_str = "The number of exploration rollout tasks to assign for go-explore. This is effectively an iteration count."
115115
hparam_dict[key]["help"] = help_str
116116

117117
key = "go_explore_epsilon_1"
@@ -159,9 +159,9 @@
159159
help_str = "The y stride to downsample the state."
160160
hparam_dict[key]["help"] = help_str
161161

162-
key = "go_explore_num_samples_per_iteration"
163-
hparam_dict[key] = {"type": int, "default": 1200}
164-
help_str = "The number of processes to use for rollout."
162+
key = "go_explore_num_samples_per_worker_task"
163+
hparam_dict[key] = {"type": int, "default": 10}
164+
help_str = "The number of samples to send each worker as a task"
165165
hparam_dict[key]["help"] = help_str
166166

167167
key = "jitter"

bridger/go_explore_phase_1.py

Lines changed: 109 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@
2828
],
2929
)
3030

31-
_WORK_PER_CHUNK = 10
32-
3331

3432
def _count_score(
3533
v: float, wa: float, pa: float, epsilon_1: float, epsilon_2: float
@@ -369,22 +367,37 @@ def rollout(
369367
return success_entries, state_sampler_cache_update
370368

371369

370+
def rollout_worker(task_queue, result_queue, rollout_func):
    """Consume rollout tasks from task_queue until a None sentinel arrives.

    Each task is a tuple of positional arguments for rollout_func; the
    return value of rollout_func(*task) is placed on result_queue.

    Fix: task_queue.task_done() is now guaranteed for every task retrieved,
    even when rollout_func raises. Previously an exception escaped before
    task_done() ran, leaving the task permanently "unfinished" and causing
    the main thread's task_queue.join() to deadlock.

    Args:
        task_queue: Joinable queue of argument tuples; None is the stop
            sentinel.
        result_queue: Queue that receives each rollout_func result.
        rollout_func: Callable invoked with the unpacked task tuple.

    Returns:
        None, once the sentinel has been consumed and acknowledged.
    """
    while True:
        task = task_queue.get()
        if task is None:
            # Acknowledge the sentinel so task_queue.join() can complete.
            task_queue.task_done()
            break

        try:
            result = rollout_func(*task)
            result_queue.put(result)
        finally:
            # Always balance the get() above, even on failure, so the
            # producer's task_queue.join() cannot hang on a crashed worker.
            task_queue.task_done()
    return None
381+
382+
372383
def explore(
373384
object_logger: ObjectLogManager,
374385
hparams: Any,
375386
) -> set[SuccessEntry]:
376-
"""
377-
Generate success entries by performing exploration in the environment.
387+
"""Generate success entries by performing exploration in the environment.
378388
379389
Uses multiple processes to collect rollouts and update the state cache.
380390
381-
This function runs a specified number of iterations, where in each iteration, it samples
382-
start states and entries from the cache, generates random seeds for each process, and
383-
collects rollouts in parallel using multiprocessing. The collected rollouts are then used
384-
to update the cache and accumulate successful entries.
391+
This function generates a specified number of tasks to send to
392+
rollout workers. For each task, it samples start states and
393+
entries from the cache, generates random seeds for each process,
394+
and collects rollouts in parallel using multiprocessing. The
395+
collected rollouts are then used to update the cache and
396+
accumulate successful entries.
385397
386398
Returns:
387399
set[SuccessEntry]: A set of generated success entries.
400+
388401
"""
389402
rng = np.random.default_rng(hparams.seed)
390403

@@ -410,40 +423,44 @@ def explore(
410423
state_sampler.update(state_sampler_cache_update)
411424

412425
success_entries: set[SuccessEntry] = set()
413-
for iteration in range(hparams.go_explore_num_iterations):
414-
if iteration + 1 % 20 == 0:
415-
print(
416-
f"[Iteration {iteration}] Successes: {len(success_entries)} ({sorted([len(x.trajectory) for x in success_entries ])})"
417-
)
418-
x = next(sorted(success_entries, key=lambda e: len(e.trajectory)))
419-
print(" Trajectory: ", x.trajectory)
420426

427+
def _get_tasks(total_task_count, new_task_count, current_best_trajectory_length):
    """Sample start states and package them into rollout worker tasks.

    Draws new_task_count * go_explore_num_samples_per_worker_task start
    entries from the state sampler, pairs every entry with its own seeded
    RNG, and groups them into per-task chunks.

    Args:
        total_task_count: Running task counter; used as the batch index
            when debug-logging the sampled start entries.
        new_task_count: Number of worker tasks to build.
        current_best_trajectory_length: Forwarded unchanged as the first
            element of every task tuple.

    Returns:
        A list of (current_best_trajectory_length, start_entries_chunk,
        rngs_chunk) tuples, one per task.
    """
    samples_per_task = hparams.go_explore_num_samples_per_worker_task
    start_entries = state_sampler.sample(n=new_task_count * samples_per_task)
    if hparams.debug:
        object_logger.log(
            f"start_entries-width-{hparams.env_width}.pkl",
            OccurrenceLogEntry(batch_idx=total_task_count, object=start_entries),
        )

    # One independently seeded RNG per start entry keeps rollouts
    # reproducible under the top-level seed.
    seeds = rng.integers(low=0, high=2**31, size=len(start_entries))
    rngs = [np.random.default_rng(seed) for seed in seeds]

    entry_chunks = chunked(start_entries, samples_per_task)
    rng_chunks = chunked(rngs, samples_per_task)
    return [
        (current_best_trajectory_length, entry_chunk, rng_chunk)
        for entry_chunk, rng_chunk in zip(entry_chunks, rng_chunks)
    ]
450+
451+
def _process_results(count: int, final_flush: bool = False):
452+
# Block on the first, then process up to count in total.
453+
try:
454+
for i in range(count):
455+
if i == 0 or final_flush:
456+
rollout_success_entries, state_sampler_cache_update = (
457+
result_queue.get()
458+
)
459+
else:
460+
rollout_success_entries, state_sampler_cache_update = (
461+
result_queue.get_nowait()
462+
)
441463

442-
with multiprocessing.Pool(processes=hparams.go_explore_num_processes) as pool:
443-
for rollout_success_entries, state_sampler_cache_update in pool.starmap(
444-
_collect_rollouts,
445-
[*zip(start_entries_chunked, rngs_chunked)],
446-
):
447464
# Compile success entries from the current set of
448465
# rollouts to build out the return value for this
449466
# function.
@@ -452,6 +469,67 @@ def explore(
452469
# iteration of exploratory rollouts.
453470
state_sampler.update(state_sampler_cache_update)
454471

472+
result_queue.task_done()
473+
except:
474+
pass
475+
476+
# Push initial tasks.  Keep roughly two tasks in flight per worker so
# workers never starve while the main thread processes results.
task_target = hparams.go_explore_num_processes * 2
task_queue = multiprocessing.JoinableQueue(maxsize=task_target)
result_queue = multiprocessing.JoinableQueue()

# Start worker processes.
_collect_rollouts = functools.partial(rollout, rollout_params)
workers = []
for _ in range(hparams.go_explore_num_processes):
    workers.append(
        multiprocessing.Process(
            target=rollout_worker,
            args=(task_queue, result_queue, _collect_rollouts),
        )
    )
for worker in workers:
    worker.start()

total_task_count = 0
while True:
    # Progress report every 20 tasks.  Fix: the original condition
    # `total_task_count + 1 % 20 == 0` parsed as
    # `total_task_count + (1 % 20) == 0` (% binds tighter than +) and
    # therefore never fired; parenthesize the sum.
    if (total_task_count + 1) % 20 == 0:
        print(
            f"[Total Task Count {total_task_count}] Successes: {len(success_entries)} ({sorted([len(x.trajectory) for x in success_entries ])})"
        )
        # Fix: next() on a list raises TypeError (a list is not an
        # iterator).  min() yields the same shortest-trajectory entry;
        # guard against an empty set to avoid ValueError.
        if success_entries:
            x = min(success_entries, key=lambda e: len(e.trajectory))
            print(" Trajectory: ", x.trajectory)

    if total_task_count == hparams.go_explore_num_tasks:
        # All tasks assigned.  Clean up by posting one sentinel per
        # worker, draining both queues, and reaping the workers.
        for _ in range(len(workers)):
            task_queue.put(None)

        task_queue.join()

        # Process all completed tasks.
        # NOTE(review): qsize() is approximate and unsupported on some
        # platforms (e.g. raises NotImplementedError on macOS) — confirm
        # target platforms before relying on it here.
        _process_results(result_queue.qsize(), final_flush=True)
        result_queue.join()

        for worker in workers:
            worker.join()

        break

    # Top the queue back up to task_target outstanding tasks.
    for task in _get_tasks(
        total_task_count=total_task_count,
        new_task_count=task_target - task_queue.qsize(),
        current_best_trajectory_length=state_sampler.current_best_trajectory_length,
    ):
        task_queue.put(task)
        total_task_count += 1
        if total_task_count == hparams.go_explore_num_tasks:
            break

    # Process up to len(workers) results to balance batching and
    # not being stuck until all the work-in-flight is done.
    _process_results(len(workers))
if hparams.debug:
456534
object_logger.log(
457535
f"state_cache-width-{hparams.env_width}.pkl",
@@ -480,7 +558,7 @@ def explore(
480558

481559
for success_entry in success_entries:
482560
object_logger.log(
483-
"success_entry-width-{hparams.env_width}.pkl", success_entry
561+
f"success_entry-width-{hparams.env_width}.pkl", success_entry
484562
)
485563

486564
print(

0 commit comments

Comments
 (0)