
Commit c9ebbf2

Compute relative reward in trace blackbox evaluator
This patch makes the trace blackbox evaluator return a relative reward rather than the raw reward. This makes the rewards actually meaningful, and it also prevents overflowing calculations in blackbox_optimizer, which previously made the new model weights NaN.

Reviewers: mtrofin
Reviewed By: mtrofin
Pull Request: #463
1 parent 8e06640 commit c9ebbf2
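
For context, a minimal sketch of what "relative reward" means here, reverse-engineered from the values asserted in the new test below; the canonical formula lives in compilation_runner.calculate_reward, and the _DELTA constant and exact shape are assumptions:

# Hypothetical sketch of the relative reward; the real implementation is
# compilation_runner.calculate_reward. _DELTA and the formula are assumptions
# chosen to be consistent with the values asserted in the new test.
_DELTA = 0.01  # guards against division by zero for a zero baseline


def relative_reward(policy_result: float, baseline: float) -> float:
  """Fractional improvement over the baseline: >0 is better, <0 is worse."""
  return (baseline - policy_result) / (baseline + _DELTA)


print(relative_reward(2, 2))  # 0.0
print(round(relative_reward(3, 2), 2))  # -0.5

Because the result is a fraction of the baseline rather than a raw measurement, rewards stay in a small, bounded range regardless of the absolute scale of the underlying trace measurements.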

File tree

4 files changed: +41 -5 lines


compiler_opt/es/blackbox_evaluator.py

Lines changed: 14 additions & 0 deletions
@@ -23,6 +23,7 @@
 from compiler_opt.rl import corpus
 from compiler_opt.es import blackbox_optimizers
 from compiler_opt.distributed import buffered_scheduler
+from compiler_opt.rl import compilation_runner
 
 
 class BlackboxEvaluator(metaclass=abc.ABCMeta):
@@ -159,3 +160,16 @@ def set_baseline(self, pool: FixedWorkerPool) -> None:
                        f' got {len(futures)}')
 
     self._baseline = futures[0].result()
+
+  def get_rewards(
+      self, results: list[concurrent.futures.Future]) -> list[float | None]:
+    rewards = []
+
+    for result in results:
+      if result.exception() is not None:
+        raise result.exception()
+
+      rewards.append(
+          compilation_runner.calculate_reward(result.result(), self._baseline))
+
+    return rewards
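
For illustration, a hypothetical sketch of the contract get_rewards now implements (the evaluator and worker plumbing are stand-ins, not the real API):

# Completed futures hold raw measurements from workers; a failed future's
# exception is re-raised; successes become rewards relative to the baseline
# stored by set_baseline.
import concurrent.futures

ok = concurrent.futures.Future()
ok.set_result(9.5)  # raw measurement reported by a worker

failed = concurrent.futures.Future()
failed.set_exception(RuntimeError('compile failed'))

# evaluator.get_rewards([ok])         -> [reward relative to the baseline]
# evaluator.get_rewards([ok, failed]) -> raises RuntimeError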

compiler_opt/es/blackbox_evaluator_test.py

Lines changed: 22 additions & 0 deletions
@@ -88,3 +88,25 @@ def test_trace_set_baseline(self):
     evaluator.set_baseline(pool)
     # pylint: disable=protected-access
     self.assertAlmostEqual(evaluator._baseline, 10)
+
+  def test_trace_get_rewards(self):
+    f1 = concurrent.futures.Future()
+    f1.set_result(2)
+    f2 = concurrent.futures.Future()
+    f2.set_result(3)
+    results = [f1, f2]
+    test_corpus = corpus.create_corpus_for_testing(
+        location=self.create_tempdir().full_path,
+        elements=[corpus.ModuleSpec(name='name1', size=1)])
+    evaluator = blackbox_evaluator.TraceBlackboxEvaluator(
+        test_corpus, blackbox_optimizers.EstimatorType.FORWARD_FD,
+        'fake_bb_trace_path', 'fake_function_index_path')
+
+    # pylint: disable=protected-access
+    evaluator._baseline = 2
+    rewards = evaluator.get_rewards(results)
+
+    # Only check for two decimal places as the reward calculation uses a
+    # reasonably large delta (0.01) when calculating the difference to
+    # prevent division by zero.
+    self.assertSequenceAlmostEqual(rewards, [0, -0.5], 2)

compiler_opt/es/blackbox_learner.py

Lines changed: 4 additions & 5 deletions
@@ -230,6 +230,9 @@ def _save_model(self) -> None:
   def get_model_weights(self) -> npt.NDArray[np.float32]:
     return self._model_weights
 
+  def set_baseline(self, pool: FixedWorkerPool) -> None:
+    self._evaluator.set_baseline(pool)
+
   def run_step(self, pool: FixedWorkerPool) -> None:
     """Run a single step of blackbox learning.
     This does not instantaneously return due to several I/O
@@ -245,12 +248,8 @@ def run_step(self, pool: FixedWorkerPool) -> None:
         p for p in initial_perturbations for p in (p, -p)
     ]
 
-    # TODO(boomanaiden154): This should be adding the perturbation to
-    # the existing model weights. That currently results in the model
-    # weights all being NaN, presumably due to rewards not being scaled for
-    # the regalloc_trace problem.
     perturbations_as_bytes = [
-        perturbation.astype(np.float32).tobytes()
+        (self._model_weights + perturbation).astype(np.float32).tobytes()
         for perturbation in initial_perturbations
     ]
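
A standalone numpy sketch of the changed serialization step, with illustrative shapes and values that are not from the repo:

# Workers now receive the perturbed weights (weights + p) rather than the
# bare perturbation p, which is what the removed TODO asked for and which
# the relative-reward scaling makes numerically safe.
import numpy as np

model_weights = np.zeros(4, dtype=np.float32)
rng = np.random.default_rng(0)
initial_perturbations = [rng.standard_normal(4) for _ in range(2)]

# Antithetic pairing (p, -p), as in run_step.
initial_perturbations = [p for p in initial_perturbations for p in (p, -p)]

perturbations_as_bytes = [
    (model_weights + perturbation).astype(np.float32).tobytes()
    for perturbation in initial_perturbations
]
assert len(perturbations_as_bytes) == 4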

compiler_opt/es/es_trainer_lib.py

Lines changed: 1 addition & 0 deletions
@@ -215,6 +215,7 @@ def train(additional_compilation_flags=(),
         worker_class,
         count=learner_config.total_num_perturbations,
         worker_kwargs=dict(gin_config=gin.operative_config_str())) as pool:
+    learner.set_baseline(pool)
     for _ in range(learner_config.total_steps):
       learner.run_step(pool)
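With this change, the baseline is measured once per training run, before the first perturbation step, so the rewards of every subsequent step are computed relative to the same reference measurement.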
