Skip to content

Commit 8f1f1a9

Browse files
Add support for copying corpora to local disk
This patch adds support in regalloc_trace_worker for copying the corpus to the local disk at initialization time. This is necessary for certain filesystems that can only be interacted with through specific APIs, precluding their direct use for applications like clang. Reviewers: mtrofin Reviewed By: mtrofin Pull Request: #465
1 parent f7e7249 commit 8f1f1a9

File tree

2 files changed

+128
-8
lines changed

2 files changed

+128
-8
lines changed

compiler_opt/es/regalloc_trace/regalloc_trace_worker.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,22 @@
2727
import concurrent.futures
2828
import tempfile
2929
import shutil
30+
from typing import Any
3031

3132
import gin
33+
import tensorflow as tf
3234

3335
from compiler_opt.rl import corpus
3436
from compiler_opt.distributed import worker
3537
from compiler_opt.rl import policy_saver
3638
from compiler_opt.es import policy_utils
3739

3840

41+
def _make_dirs_and_copy(old_file_path: str, new_file_path: str):
    """Copies one file, creating the destination's parent directory first.

    Args:
      old_file_path: The path of the file to copy from.
      new_file_path: The path to copy the file to.
    """
    destination_dir = os.path.dirname(new_file_path)
    tf.io.gfile.makedirs(destination_dir)
    tf.io.gfile.copy(old_file_path, new_file_path)
44+
45+
3946
@gin.configurable
4047
class RegallocTraceWorker(worker.Worker):
4148
"""A worker that produces rewards for a given regalloc policy.
@@ -53,9 +60,55 @@ def _setup_base_policy(self):
5360
saver.save(self._tf_base_temp_dir)
5461
self._tf_base_policy_path = os.path.join(self._tf_base_temp_dir, "policy")
5562

56-
def __init__(self, *, gin_config: str, clang_path: str,
57-
basic_block_trace_model_path: str, thread_count: int,
58-
corpus_path: str):
63+
def _copy_corpus(self, corpus_path: str,
                 copy_corpus_locally_path: str | None) -> None:
    """Makes a local copy of the corpus if requested.

    Reads corpus_description.json from the source corpus and copies, for
    every module it lists, the module's .bc and .cmd files (plus the
    .thinlto.bc file when the description sets has_thinlto) to the same
    relative location under the destination directory. Nothing is copied
    when the destination path already exists.

    Args:
      corpus_path: The path to the remote corpus.
      copy_corpus_locally_path: The local path to copy the corpus to. If
        None, no copy is made.

    Raises:
      Exception: Whatever error a failed file copy raised; the first failure
        in submission order is propagated.
    """
    # The signature allows None, so honour it rather than handing None to the
    # filesystem APIs below.
    if copy_corpus_locally_path is None:
        return

    # We use the tensorflow APIs below rather than the standard Python file
    # APIs for compatibility with more filesystems.

    if tf.io.gfile.exists(copy_corpus_locally_path):
        return

    with tf.io.gfile.GFile(
            os.path.join(corpus_path, "corpus_description.json"),
            "r") as corpus_description_file:
        corpus_description: dict[str, Any] = json.load(corpus_description_file)

    file_extensions_to_copy = [".bc", ".cmd"]
    if corpus_description["has_thinlto"]:
        file_extensions_to_copy.append(".thinlto.bc")

    copy_futures = []
    # The pool is sized at five times the worker's thread count; presumably
    # because the copies are I/O bound -- TODO confirm.
    with concurrent.futures.ThreadPoolExecutor(self._thread_count *
                                               5) as copy_thread_pool:
        for module in corpus_description["modules"]:
            for extension in file_extensions_to_copy:
                relative_path = module + extension
                copy_futures.append(
                    copy_thread_pool.submit(
                        _make_dirs_and_copy,
                        os.path.join(corpus_path, relative_path),
                        os.path.join(copy_corpus_locally_path, relative_path)))

    # result() re-raises the exception of a failed copy, if there was one.
    for copy_future in copy_futures:
        copy_future.result()
103+
104+
def __init__(self,
105+
*,
106+
gin_config: str,
107+
clang_path: str,
108+
basic_block_trace_model_path: str,
109+
thread_count: int,
110+
corpus_path: str,
111+
copy_corpus_locally_path: str | None = None):
59112
"""Initializes the RegallocTraceWorker class.
60113
61114
Args:
@@ -68,11 +121,19 @@ def __init__(self, *, gin_config: str, clang_path: str,
68121
thread_count: The number of threads to use for concurrent compilation
69122
and modelling.
70123
corpus_path: The path to the corpus that modules will be compiled from.
124+
copy_corpus_locally_path: If set, specifies the path that the corpus
125+
should be copied to before utilizing the modules for evaluation.
126+
Setting this to None signifies that no copying is desired.
71127
"""
72128
self._clang_path = clang_path
73129
self._basic_block_trace_model_path = basic_block_trace_model_path
74130
self._thread_count = thread_count
131+
self._has_local_corpus = False
75132
self._corpus_path = corpus_path
133+
if copy_corpus_locally_path is not None:
134+
self._copy_corpus(corpus_path, copy_corpus_locally_path)
135+
self._corpus_path = copy_corpus_locally_path
136+
self._has_local_corpus = True
76137

77138
gin.parse_config(gin_config)
78139
self._setup_base_policy()
@@ -83,6 +144,8 @@ def __init__(self, *, gin_config: str, clang_path: str,
83144
# have tempdirs wiped periodically.
84145
def __del__(self):
    """Deletes the temporary base policy directory and any local corpus copy.

    The finalizer also runs on instances whose __init__ raised part-way
    through, so every attribute is looked up defensively instead of assuming
    that initialization completed.
    """
    tf_base_temp_dir = getattr(self, "_tf_base_temp_dir", None)
    if tf_base_temp_dir is not None:
        shutil.rmtree(tf_base_temp_dir)
    # _has_local_corpus is only True once _corpus_path points at the copy.
    if getattr(self, "_has_local_corpus", False):
        shutil.rmtree(self._corpus_path)
86149

87150
def _compile_module(self, module_to_compile: corpus.ModuleSpec,
88151
output_directory: str, tflite_policy_path: str | None):

compiler_opt/es/regalloc_trace/regalloc_trace_worker_test.py

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import os
1717
import json
18+
from pathlib import Path
1819
import stat
1920
import textwrap
2021

@@ -26,15 +27,16 @@
2627
from compiler_opt.rl import corpus
2728

2829

29-
def _setup_corpus(corpus_dir: str) -> list[corpus.ModuleSpec]:
30+
def _setup_corpus(corpus_dir: str,
31+
has_thinlto: bool = False) -> list[corpus.ModuleSpec]:
3032
modules = [
31-
corpus.ModuleSpec("module_a", 1, ("-fmodule-a",), True),
32-
corpus.ModuleSpec("module_b", 1, ("-fmodule-b",), True)
33+
corpus.ModuleSpec("module_a.o", 1, ("-fmodule-a",), True),
34+
corpus.ModuleSpec("module_b.o", 1, ("-fmodule-b",), True)
3335
]
3436

3537
corpus_description = {
36-
"has_thinlto": True,
37-
"modules": [os.path.join(corpus_dir, module.name) for module in modules]
38+
"has_thinlto": has_thinlto,
39+
"modules": [module.name for module in modules]
3840
}
3941

4042
with open(
@@ -43,6 +45,15 @@ def _setup_corpus(corpus_dir: str) -> list[corpus.ModuleSpec]:
4345
encoding="utf-8") as corpus_description_handle:
4446
json.dump(corpus_description, corpus_description_handle)
4547

48+
for module in ["module_a.o", "module_b.o"]:
49+
extensions = [".cmd", ".bc"]
50+
if has_thinlto:
51+
extensions.append(".thinlto.bc")
52+
53+
for extension in extensions:
54+
module_path = os.path.join(corpus_dir, module + extension)
55+
Path(module_path).touch()
56+
4657
return modules
4758

4859

@@ -151,3 +162,49 @@ def test_compile_corpus_and_evaluate_with_tflite(self):
151162
self.assertTrue(
152163
"-regalloc-enable-advisor=development" in clang_command_lines[1])
153164
self.assertTrue("-regalloc-model=" in clang_command_lines[1])
165+
166+
def test_copy_corpus_locally(self):
    """Checks the corpus is copied on construction and removed on deletion."""
    copy_parent_dir = self.create_tempdir("corpus_copy")
    corpus_copy_dir = os.path.join(copy_parent_dir.full_path, "corpus_copy")
    corpus_dir = self.create_tempdir("corpus")
    _setup_corpus(corpus_dir.full_path)

    worker = regalloc_trace_worker.RegallocTraceWorker(
        gin_config="",
        clang_path="/fake/path/to/clamg",
        basic_block_trace_model_path="/fake/path/to/basic_block_trace_model",
        thread_count=1,
        corpus_path=corpus_dir.full_path,
        copy_corpus_locally_path=corpus_copy_dir)

    expected_files = ("module_a.o.bc", "module_a.o.cmd", "module_b.o.bc",
                      "module_b.o.cmd")
    for expected_file in expected_files:
        copied_path = os.path.join(corpus_copy_dir, expected_file)
        self.assertTrue(os.path.exists(copied_path))

    # Check that the worker cleans up after itself upon deletion.
    del worker
    self.assertFalse(os.path.exists(corpus_copy_dir))
192+
193+
def test_copy_corpus_locally_thinlto(self):
    """Checks the .thinlto.bc files are copied for a ThinLTO corpus."""
    copy_parent_dir = self.create_tempdir("corpus_copy")
    corpus_copy_dir = os.path.join(copy_parent_dir.full_path, "corpus_copy")
    corpus_dir = self.create_tempdir("corpus")
    _setup_corpus(corpus_dir.full_path, True)

    # Keep the worker bound to a local so that it outlives the assertions.
    _ = regalloc_trace_worker.RegallocTraceWorker(
        gin_config="",
        clang_path="/fake/path/to/clamg",
        basic_block_trace_model_path="/fake/path/to/basic_block_trace_model",
        thread_count=1,
        corpus_path=corpus_dir.full_path,
        copy_corpus_locally_path=corpus_copy_dir)

    for expected_file in ("module_a.o.thinlto.bc", "module_b.o.thinlto.bc"):
        copied_path = os.path.join(corpus_copy_dir, expected_file)
        self.assertTrue(os.path.exists(copied_path))

0 commit comments

Comments
 (0)