Add option to generate_default_trace to output keys (#477)

boomanaiden154 · web-flow · commit 087d44664351 · 2025-03-28T21:37:31.000-07:00
This patch adds a --keys_file flag to generate_default_trace that writes the
keys associated with the collected examples to the given file, one key per
line. This is primarily intended for extracting the functions that contain
eviction decisions, to improve the efficiency of the new training workflow for
ES regalloc trace modelling. However, it might also be useful in other
circumstances where more introspection into the data is needed.
diff --git a/compiler_opt/tools/generate_default_trace.py b/compiler_opt/tools/generate_default_trace.py
@@ -64,6 +64,9 @@
 _GIN_BINDINGS = flags.DEFINE_multi_string(
     'gin_bindings', [],
     'Gin bindings to override the values set in the config files.')
+_KEYS_FILE = flags.DEFINE_string(
+    'keys_file', None,
+    'The path to the file to write out the keys encountered.')
 
 
 class FilteringWorker(worker.Worker):
@@ -86,24 +89,28 @@ def __init__(self, policy_path: str | None, key_filter: str | None,
 
   def compile_and_filter(
       self, loaded_module_spec: corpus.LoadedModuleSpec
-  ) -> tuple[str, list[str], dict[str, compilation_runner.RewardStat]]:
+  ) -> tuple[str, list[str], dict[str, compilation_runner.RewardStat],
+             list[str]]:
     data = self._runner.collect_data(
         loaded_module_spec=loaded_module_spec,
         policy=self._policy,
         reward_stat=None,
         model_id=0)
     if self._key_filter is None:
       return (loaded_module_spec.name, data.serialized_sequence_examples,
-              data.reward_stats)
+              data.reward_stats, data.keys)
     new_reward_stats = {}
     new_sequence_examples = []
+    new_keys = []
     for k, sequence_example in zip(data.keys,
                                    data.serialized_sequence_examples):
       if not self._key_filter.match(k):
         continue
       new_reward_stats[k] = data.reward_stats[k]
       new_sequence_examples.append(sequence_example)
-    return (loaded_module_spec.name, new_sequence_examples, new_reward_stats)
+      new_keys.append(k)
+    return (loaded_module_spec.name, new_sequence_examples, new_reward_stats,
+            new_keys)
 
 
 def main(_):
@@ -147,6 +154,7 @@ def generate_trace(worker_manager_class: type[
   work = [
       cps.load_module_spec(corpus_element) for corpus_element in corpus_elements
   ]
+  all_keys = []
 
   runner_type = config.get_runner_type()
   with tfrecord_context as tfrecord_writer:
@@ -178,7 +186,8 @@ def generate_trace(worker_manager_class: type[
           total_successful_examples += len(succeeded)
           total_failed_examples += (len(done) - len(succeeded))
           for r in succeeded:
-            module_name, records, reward_stat = r.result()
+            module_name, records, reward_stat, keys = r.result()
+            all_keys.extend(keys)
             if tfrecord_writer:
               total_training_examples += len(records)
               for r in records:
@@ -196,6 +205,10 @@ def generate_trace(worker_manager_class: type[
         f'succeeded, and {total_training_examples} trainining examples '
         'written')
 
+  if _KEYS_FILE.value is not None:
+    with open(_KEYS_FILE.value, 'w', encoding='utf-8') as keys_file:
+      keys_file.write('\n'.join(str(key) for key in all_keys) + '\n')
+
 
 if __name__ == '__main__':
   flags.mark_flag_as_required('data_path')
diff --git a/compiler_opt/tools/generate_default_trace_test.py b/compiler_opt/tools/generate_default_trace_test.py
@@ -61,16 +61,17 @@ def collect_data(self,
     sequence_example = text_format.Parse(sequence_example_text,
                                          tf.train.SequenceExample())
 
+    key = f'key_{os.getpid()}'
     return compilation_runner.CompilationResult(
         sequence_examples=[sequence_example],
         reward_stats={
-            'default':
+            key:
                 compilation_runner.RewardStat(
                     default_reward=1, moving_average_reward=2)
         },
         rewards=[1.2],
         policy_rewards=[18],
-        keys=['default'],
+        keys=[key],
         model_id=model_id)
 
 
@@ -111,9 +112,16 @@ def test_generate_trace(self, mock_get_runner):
         output_path=os.path.join(tmp_dir.full_path, 'output'),
         output_performance_path=os.path.join(tmp_dir.full_path,
                                              'output_performance'),
-    ):
+        keys_file=os.path.join(tmp_dir.full_path, 'keys_file')):
       generate_default_trace.generate_trace()
 
+    with open(
+        os.path.join(tmp_dir.full_path, 'keys_file'),
+        encoding='utf-8') as keys_file:
+      keys = [key_line.strip() for key_line in keys_file.readlines()]
+      for key in keys:
+        self.assertStartsWith(key, 'key_')
+
 
 if __name__ == '__main__':
   multiprocessing.handle_main(absltest.main)