Skip to content

Commit 6d9b2ae

Browse files
authored
compilation_runner: Option to keep temporary files (#235)
This allows users to manually re-run the commands after the fact to investigate failures.
1 parent 4bcd173 commit 6d9b2ae

File tree

5 files changed

+145
-119
lines changed

5 files changed

+145
-119
lines changed

compiler_opt/rl/compilation_runner.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
'Max duration (in seconds) after which we cancel any compilation job.')
4040
_QUIET = flags.DEFINE_bool(
4141
'quiet', True, 'Whether or not to compile quietly (hiding info logging)')
42+
_KEEP_TEMPS = flags.DEFINE_string(
43+
'keep_temps', None,
44+
'Put temporary files into given directory and keep them past exit.')
4245

4346

4447
def _calculate_reward(policy: float, baseline: float) -> float:
@@ -53,6 +56,30 @@ class RewardStat:
5356
moving_average_reward: float
5457

5558

59+
class NonTemporaryDirectory:
60+
"""Behaves like `tempfile.TemporaryDirectory` but does not clean up the
61+
directory. When Python 3.12 is available, this class can be replaced with
62+
`TemporaryDirectory(..., delete=False)`"""
63+
64+
def __init__(
65+
self,
66+
suffix: Optional[str] = None,
67+
prefix: Optional[str] = None,
68+
dir: Optional[str] = None, # pylint: disable=redefined-builtin
69+
ignore_cleanup_errors: bool = False):
70+
_ = ignore_cleanup_errors # unused
71+
self.name = tempfile.mkdtemp(suffix, prefix, dir)
72+
73+
def __repr__(self):
74+
return f'<{self.__class__.__name__} {self.name!r}>'
75+
76+
def __enter__(self):
77+
return self.name
78+
79+
def __exit__(self, exc, value, tb):
80+
pass
81+
82+
5683
def _overwrite_trajectory_reward(sequence_example: tf.train.SequenceExample,
5784
reward: float) -> tf.train.SequenceExample:
5885
"""Overwrite the reward in the trace (sequence_example) with the given one.
@@ -376,7 +403,12 @@ def collect_data(self,
376403
compilation_runner.ProcessKilledException is passed through.
377404
ValueError if example under default policy and ml policy does not match.
378405
"""
379-
with tempfile.TemporaryDirectory() as tempdir:
406+
if _KEEP_TEMPS.present:
407+
tempdir_context = NonTemporaryDirectory(dir=_KEEP_TEMPS.value)
408+
else:
409+
tempdir_context = tempfile.TemporaryDirectory()
410+
411+
with tempdir_context as tempdir:
380412
final_cmd_line = loaded_module_spec.build_command_line(tempdir)
381413
# TODO(mtrofin): remove this once the compiler only generates this by
382414
# default
@@ -392,14 +424,17 @@ def collect_data(self,
392424

393425
if reward_stat is None:
394426
default_result = self.compile_fn(
395-
final_cmd_line, tf_policy_path='', reward_only=bool(tf_policy_path))
427+
final_cmd_line,
428+
tf_policy_path='',
429+
reward_only=bool(tf_policy_path),
430+
workdir=tempdir)
396431
reward_stat = {
397432
k: RewardStat(v[1], v[1]) for (k, v) in default_result.items()
398433
}
399434

400435
if tf_policy_path:
401436
policy_result = self.compile_fn(
402-
final_cmd_line, tf_policy_path, reward_only=False)
437+
final_cmd_line, tf_policy_path, reward_only=False, workdir=tempdir)
403438
else:
404439
policy_result = default_result
405440

@@ -444,7 +479,8 @@ def collect_data(self,
444479

445480
def compile_fn(
446481
self, command_line: corpus.FullyQualifiedCmdLine, tf_policy_path: str,
447-
reward_only: bool) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
482+
reward_only: bool,
483+
workdir: str) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
448484
"""Compiles for the given IR file under the given policy.
449485
450486
Args:

compiler_opt/rl/compilation_runner_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def _get_sequence_example(feature_value):
7777
return text_format.Parse(sequence_example_text, tf.train.SequenceExample())
7878

7979

80-
def _mock_compile_fn(file_paths, tf_policy_path, reward_only): # pylint: disable=unused-argument
80+
def _mock_compile_fn(file_paths, tf_policy_path, reward_only, workdir): # pylint: disable=unused-argument
8181
del file_paths
8282
if tf_policy_path:
8383
sequence_example = _get_sequence_example(_POLICY_FEATURE_VALUE)

compiler_opt/rl/inlining/inlining_runner.py

Lines changed: 44 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def __init__(self, llvm_size_path: str, *args, **kwargs):
4747

4848
def compile_fn(
4949
self, command_line: corpus.FullyQualifiedCmdLine, tf_policy_path: str,
50-
reward_only: bool) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
50+
reward_only: bool,
51+
workdir: str) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
5152
"""Run inlining for the given IR file under the given policy.
5253
5354
Args:
@@ -68,56 +69,52 @@ def compile_fn(
6869
RuntimeError: if llvm-size produces unexpected output.
6970
"""
7071

71-
working_dir = tempfile.mkdtemp()
72+
working_dir = tempfile.mkdtemp(dir=workdir)
7273

7374
log_path = os.path.join(working_dir, 'log')
7475
output_native_path = os.path.join(working_dir, 'native')
7576

7677
native_size = 0
77-
try:
78-
cmdline = []
79-
if self._launcher_path:
80-
cmdline.append(self._launcher_path)
81-
cmdline.extend([self._clang_path] + list(command_line) + [
82-
'-mllvm', '-enable-ml-inliner=development', '-mllvm',
83-
'-training-log=' + log_path, '-o', output_native_path
84-
])
85-
if tf_policy_path:
86-
cmdline.extend(
87-
['-mllvm', '-ml-inliner-model-under-training=' + tf_policy_path])
88-
compilation_runner.start_cancellable_process(cmdline,
89-
self._compilation_timeout,
90-
self._cancellation_manager)
91-
cmdline = [self._llvm_size_path, output_native_path]
92-
output_bytes = compilation_runner.start_cancellable_process(
93-
cmdline,
94-
timeout=self._compilation_timeout,
95-
cancellation_manager=self._cancellation_manager,
96-
want_output=True)
97-
if not output_bytes:
98-
raise RuntimeError(f'Empty llvm-size output: {" ".join(cmdline)}')
99-
output = output_bytes.decode('utf-8')
100-
tmp = output.split('\n')
101-
if len(tmp) != 3:
102-
raise RuntimeError(f'Wrong llvm-size output {output}')
103-
tmp = tmp[1].split('\t')
104-
native_size = int(tmp[0])
105-
106-
if native_size == 0:
107-
return {}
108-
109-
if reward_only:
110-
return {_DEFAULT_IDENTIFIER: (None, native_size)}
111-
112-
result = log_reader.read_log_as_sequence_examples(log_path)
113-
if len(result) != 1:
114-
return {}
115-
sequence_example = next(iter(result.values()))
116-
117-
if not sequence_example.HasField('feature_lists'):
118-
return {}
119-
120-
finally:
121-
tf.io.gfile.rmtree(working_dir)
78+
cmdline = []
79+
if self._launcher_path:
80+
cmdline.append(self._launcher_path)
81+
cmdline.extend([self._clang_path] + list(command_line) + [
82+
'-mllvm', '-enable-ml-inliner=development', '-mllvm', '-training-log=' +
83+
log_path, '-o', output_native_path
84+
])
85+
if tf_policy_path:
86+
cmdline.extend(
87+
['-mllvm', '-ml-inliner-model-under-training=' + tf_policy_path])
88+
compilation_runner.start_cancellable_process(cmdline,
89+
self._compilation_timeout,
90+
self._cancellation_manager)
91+
cmdline = [self._llvm_size_path, output_native_path]
92+
output_bytes = compilation_runner.start_cancellable_process(
93+
cmdline,
94+
timeout=self._compilation_timeout,
95+
cancellation_manager=self._cancellation_manager,
96+
want_output=True)
97+
if not output_bytes:
98+
raise RuntimeError(f'Empty llvm-size output: {" ".join(cmdline)}')
99+
output = output_bytes.decode('utf-8')
100+
tmp = output.split('\n')
101+
if len(tmp) != 3:
102+
raise RuntimeError(f'Wrong llvm-size output {output}')
103+
tmp = tmp[1].split('\t')
104+
native_size = int(tmp[0])
105+
106+
if native_size == 0:
107+
return {}
108+
109+
if reward_only:
110+
return {_DEFAULT_IDENTIFIER: (None, native_size)}
111+
112+
result = log_reader.read_log_as_sequence_examples(log_path)
113+
if len(result) != 1:
114+
return {}
115+
sequence_example = next(iter(result.values()))
116+
117+
if not sequence_example.HasField('feature_lists'):
118+
return {}
122119

123120
return {_DEFAULT_IDENTIFIER: (sequence_example, native_size)}

compiler_opt/rl/regalloc/regalloc_runner.py

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ class RegAllocRunner(compilation_runner.CompilationRunner):
4242
# construction
4343
def compile_fn(
4444
self, command_line: corpus.FullyQualifiedCmdLine, tf_policy_path: str,
45-
reward_only: bool) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
45+
reward_only: bool,
46+
workdir: str) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
4647
"""Run inlining for the given IR file under the given policy.
4748
4849
Args:
@@ -63,43 +64,39 @@ def compile_fn(
6364
RuntimeError: if llvm-size produces unexpected output.
6465
"""
6566

66-
working_dir = tempfile.mkdtemp()
67+
working_dir = tempfile.mkdtemp(dir=workdir)
6768

6869
log_path = os.path.join(working_dir, 'log')
6970
output_native_path = os.path.join(working_dir, 'native')
7071

7172
result = {}
72-
try:
73-
cmdline = []
74-
if self._launcher_path:
75-
cmdline.append(self._launcher_path)
76-
cmdline.extend([self._clang_path] + list(command_line) + [
77-
'-mllvm', '-regalloc-enable-advisor=development', '-mllvm',
78-
'-regalloc-training-log=' + log_path, '-o', output_native_path
79-
])
80-
81-
if tf_policy_path:
82-
cmdline.extend(['-mllvm', '-regalloc-model=' + tf_policy_path])
83-
compilation_runner.start_cancellable_process(cmdline,
84-
self._compilation_timeout,
85-
self._cancellation_manager)
86-
87-
# TODO(#202)
88-
log_result = log_reader.read_log_as_sequence_examples(log_path)
89-
90-
for fct_name, trajectory in log_result.items():
91-
if not trajectory.HasField('feature_lists'):
92-
continue
93-
score = (
94-
trajectory.feature_lists.feature_list['reward'].feature[-1]
95-
.float_list.value[0])
96-
if reward_only:
97-
result[fct_name] = (None, score)
98-
else:
99-
del trajectory.feature_lists.feature_list['reward']
100-
result[fct_name] = (trajectory, score)
101-
102-
finally:
103-
tf.io.gfile.rmtree(working_dir)
73+
cmdline = []
74+
if self._launcher_path:
75+
cmdline.append(self._launcher_path)
76+
cmdline.extend([self._clang_path] + list(command_line) + [
77+
'-mllvm', '-regalloc-enable-advisor=development', '-mllvm',
78+
'-regalloc-training-log=' + log_path, '-o', output_native_path
79+
])
80+
81+
if tf_policy_path:
82+
cmdline.extend(['-mllvm', '-regalloc-model=' + tf_policy_path])
83+
compilation_runner.start_cancellable_process(cmdline,
84+
self._compilation_timeout,
85+
self._cancellation_manager)
86+
87+
# TODO(#202)
88+
log_result = log_reader.read_log_as_sequence_examples(log_path)
89+
90+
for fct_name, trajectory in log_result.items():
91+
if not trajectory.HasField('feature_lists'):
92+
continue
93+
score = (
94+
trajectory.feature_lists.feature_list['reward'].feature[-1].float_list
95+
.value[0])
96+
if reward_only:
97+
result[fct_name] = (None, score)
98+
else:
99+
del trajectory.feature_lists.feature_list['reward']
100+
result[fct_name] = (trajectory, score)
104101

105102
return result

compiler_opt/rl/regalloc_priority/regalloc_priority_runner.py

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -31,50 +31,46 @@ class RegAllocPriorityRunner(compilation_runner.CompilationRunner):
3131

3232
def _compile_fn(
3333
self, file_paths: Tuple[str, ...], tf_policy_path: str, reward_only: bool,
34-
cancellation_manager: Optional[
34+
workdir: str, cancellation_manager: Optional[
3535
compilation_runner.WorkerCancellationManager]
3636
) -> Dict[str, Tuple[tf.train.SequenceExample, float]]:
3737

3838
file_paths = file_paths[0].replace('.bc', '')
39-
working_dir = tempfile.mkdtemp()
39+
working_dir = tempfile.mkdtemp(dir=workdir)
4040

4141
log_path = os.path.join(working_dir, 'log')
4242
output_native_path = os.path.join(working_dir, 'native')
4343

4444
result = {}
45-
try:
46-
command_line = []
47-
if self._launcher_path:
48-
command_line.append(self._launcher_path)
49-
command_line.extend([self._clang_path] + [
50-
'-c', file_paths, '-O3', '-mllvm', '-regalloc-priority-training-log='
51-
+ log_path, '-mllvm', '-regalloc-enable-priority-advisor=development',
52-
'-o', output_native_path
53-
])
45+
command_line = []
46+
if self._launcher_path:
47+
command_line.append(self._launcher_path)
48+
command_line.extend([self._clang_path] + [
49+
'-c', file_paths, '-O3', '-mllvm', '-regalloc-priority-training-log=' +
50+
log_path, '-mllvm', '-regalloc-enable-priority-advisor=development',
51+
'-o', output_native_path
52+
])
5453

55-
if tf_policy_path:
56-
command_line.extend(
57-
['-mllvm', '-regalloc-priority-model=' + tf_policy_path])
58-
compilation_runner.start_cancellable_process(command_line,
59-
self._compilation_timeout,
60-
cancellation_manager)
54+
if tf_policy_path:
55+
command_line.extend(
56+
['-mllvm', '-regalloc-priority-model=' + tf_policy_path])
57+
compilation_runner.start_cancellable_process(command_line,
58+
self._compilation_timeout,
59+
cancellation_manager)
6160

62-
# TODO(#202)
63-
log_result = log_reader.read_log_as_sequence_examples(log_path)
61+
# TODO(#202)
62+
log_result = log_reader.read_log_as_sequence_examples(log_path)
6463

65-
for fct_name, trajectory in log_result.items():
66-
if not trajectory.HasField('feature_lists'):
67-
continue
68-
score = (
69-
trajectory.feature_lists.feature_list['reward'].feature[-1]
70-
.float_list.value[0])
71-
if reward_only:
72-
result[fct_name] = (None, score)
73-
else:
74-
del trajectory.feature_lists.feature_list['reward']
75-
result[fct_name] = (trajectory, score)
76-
77-
finally:
78-
tf.io.gfile.rmtree(working_dir)
64+
for fct_name, trajectory in log_result.items():
65+
if not trajectory.HasField('feature_lists'):
66+
continue
67+
score = (
68+
trajectory.feature_lists.feature_list['reward'].feature[-1].float_list
69+
.value[0])
70+
if reward_only:
71+
result[fct_name] = (None, score)
72+
else:
73+
del trajectory.feature_lists.feature_list['reward']
74+
result[fct_name] = (trajectory, score)
7975

8076
return result

0 commit comments

Comments
 (0)