Skip to content

Commit 21c230c

Browse files
authored
Add support for extracting lld-thinlto corpora (#58)
1 parent b9f9182 commit 21c230c

File tree

2 files changed

+140
-11
lines changed

2 files changed

+140
-11
lines changed

compiler_opt/tools/extract_ir.py

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,17 @@
2323
The compilation is assumed to have been performed with clang, using
2424
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
2525
26-
In a ThinLTO case, the compilation is assumed to have been performed specifying
27-
-mllvm -lto-embed-bitcode=post-merge-pre-opt.
26+
In a distributed ThinLTO case, the compilation is assumed to have been performed
27+
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
28+
29+
In a local ThinLTO case, the compilation is assumed to have been performed
30+
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
2831
"""
2932

3033
import json
3134
import multiprocessing
3235
import os
36+
import pathlib
3337
import re
3438
import shutil
3539
import subprocess
@@ -59,11 +63,15 @@
5963
'Include only those modules with a command line matching this regexp. '
6064
'Setting it to None for not filtering. Note that the regexp is applied '
6165
'independently for each separate command line option. For example, ^-Oz$ '
62-
'will match Oz - built binaries.')
63-
flags.DEFINE_bool(
64-
'thinlto_build', False, 'Set if the build was ThinLTO, to '
65-
'ensure index files are also copied. The build is assumed to have had'
66-
'-mllvm -lto-embed-bitcode=post-merge-pre-opt passed to clang.')
66+
'will match Oz - built binaries. Does not work with thinlto_build=local.')
67+
flags.DEFINE_enum(
68+
'thinlto_build', None, ['distributed', 'local'],
69+
'Set if the build was performed with either \'distributed\' or '
70+
'\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
71+
'The build is assumed to have had '
72+
'-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
73+
'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
74+
'passed in the local case.')
6775

6876
FLAGS = flags.FLAGS
6977

@@ -118,6 +126,16 @@ def relative_output_path(self):
118126
def input_obj(self):
119127
return os.path.join(self.obj_base_dir(), self._obj_relative_path)
120128

129+
def lld_src_bc(self):
130+
# .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
131+
# IR bitcode saved by lld. It is hardcoded into lld.
132+
return os.path.join(self._obj_base_dir,
133+
self._obj_relative_path + '.3.import.bc')
134+
135+
def lld_src_thinlto(self):
136+
return os.path.join(self._obj_base_dir,
137+
self._obj_relative_path + '.thinlto.bc')
138+
121139
def dest_dir(self):
122140
return os.path.join(self.output_base_dir(),
123141
os.path.dirname(self._obj_relative_path))
@@ -148,8 +166,8 @@ def _get_extraction_bc_command(self, llvm_objcopy_path):
148166
self.input_obj(), '/dev/null'
149167
]
150168

151-
def extract(self, llvm_objcopy_path: str, cmd_filter: str,
152-
is_thinlto: bool) -> Optional[str]:
169+
def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
170+
is_thinlto: bool) -> Optional[str]:
153171
"""Run llvm-objcopy to extract the .bc and command line."""
154172
if not os.path.exists(self.input_obj()):
155173
logging.info('%s does not exist.', self.input_obj())
@@ -184,6 +202,36 @@ def extract(self, llvm_objcopy_path: str, cmd_filter: str,
184202
(not is_thinlto or os.path.exists(self.thinlto_index_file())))
185203
return self.relative_output_path()
186204

205+
def _extract_lld_artifacts(self) -> Optional[str]:
206+
"""Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
207+
"""
208+
if not os.path.exists(self.lld_src_bc()):
209+
logging.info('%s does not exist.', self.lld_src_bc())
210+
return None
211+
if not os.path.exists(self.lld_src_thinlto()):
212+
logging.info('%s does not exist.', self.lld_src_thinlto())
213+
return None
214+
os.makedirs(self.dest_dir(), exist_ok=True)
215+
216+
# Copy over the files
217+
shutil.copy(self.lld_src_bc(), self.bc_file())
218+
shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
219+
220+
assert os.path.exists(self.bc_file())
221+
assert os.path.exists(self.thinlto_index_file())
222+
return self._obj_relative_path
223+
224+
def extract(self,
225+
llvm_objcopy_path: Optional[str] = None,
226+
cmd_filter: Optional[str] = None,
227+
thinlto_build: Optional[str] = None) -> Optional[str]:
228+
if thinlto_build == 'local':
229+
return self._extract_lld_artifacts()
230+
return self._extract_clang_artifacts(
231+
llvm_objcopy_path=llvm_objcopy_path,
232+
cmd_filter=cmd_filter,
233+
is_thinlto=thinlto_build == 'distributed')
234+
187235

188236
def convert_compile_command_to_objectfile(command: Dict[str, str],
189237
output_dir: str):
@@ -232,6 +280,24 @@ def make_obj(obj_file: str) -> TrainingIRExtractor:
232280
return [make_obj(obj_file) for obj_file in just_obj_paths]
233281

234282

283+
def load_for_lld_thinlto(obj_base_dir: str,
284+
output_dir: str) -> List[TrainingIRExtractor]:
285+
# .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
286+
# IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
287+
# are also emitted next to the postimport bitcode, with the suffix
288+
# .thinlto.bc instead of .3.import.bc.
289+
paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
290+
291+
def make_spec(obj_file: str):
292+
return TrainingIRExtractor(
293+
# Cut away .3.import.bc
294+
obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
295+
output_base_dir=output_dir,
296+
obj_base_dir=obj_base_dir)
297+
298+
return [make_spec(path) for path in paths]
299+
300+
235301
# This is here just for readability, lint complains if the pooling expression is
236302
# over 3 lines; and it needs to be a non-local so it may be pickled.
237303
def extract_artifacts(obj: TrainingIRExtractor) -> Optional[str]:
@@ -242,9 +308,14 @@ def extract_artifacts(obj: TrainingIRExtractor) -> Optional[str]:
242308
def main(argv):
243309
if len(argv) > 1:
244310
raise app.UsageError('Too many command-line arguments.')
245-
flags.mark_flags_as_required(['output_dir', 'input'])
311+
flags.mark_flags_as_required(['output_dir'])
312+
246313
objs = []
247-
if FLAGS.input_type == 'json':
314+
if FLAGS.input is None:
315+
if FLAGS.thinlto_build != 'local':
316+
raise ValueError('--input or --thinlto_build=local must be provided')
317+
objs = load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
318+
elif FLAGS.input_type == 'json':
248319
with open(FLAGS.input, encoding='utf-8') as f:
249320
objs = load_from_compile_commands(json.load(f), FLAGS.output_dir)
250321
elif FLAGS.input_type == 'params':

compiler_opt/tools/extract_ir_test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""Tests for compiler_opt.tools.extract_ir."""
1616

1717
# pylint: disable=protected-access
18+
import os.path
1819

1920
from absl import flags
2021
from absl.testing import absltest
@@ -116,6 +117,63 @@ def test_lld_params(self):
116117
'/tmp/out/lib/obj1.o.thinlto.bc')
117118
self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
118119

120+
def test_lld_thinlto_discovery(self):
121+
tempdir = self.create_tempdir()
122+
tempdir.create_file(file_path='1.3.import.bc')
123+
tempdir.create_file(file_path='2.3.import.bc')
124+
tempdir.create_file(file_path='3.3.import.bc')
125+
tempdir.create_file(file_path='1.thinlto.bc')
126+
tempdir.create_file(file_path='2.thinlto.bc')
127+
tempdir.create_file(file_path='3.thinlto.bc')
128+
outdir = self.create_tempdir()
129+
obj = extract_ir.load_for_lld_thinlto(tempdir.full_path, outdir.full_path)
130+
self.assertLen(obj, 3)
131+
for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
132+
self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
133+
self.assertEqual(o._obj_base_dir, tempdir.full_path)
134+
self.assertEqual(o._output_base_dir, outdir.full_path)
135+
136+
def test_lld_thinlto_discovery_nested(self):
137+
outer = self.create_tempdir()
138+
tempdir = outer.mkdir(dir_path='nest')
139+
tempdir.create_file(file_path='1.3.import.bc')
140+
tempdir.create_file(file_path='2.3.import.bc')
141+
tempdir.create_file(file_path='3.3.import.bc')
142+
tempdir.create_file(file_path='1.thinlto.bc')
143+
tempdir.create_file(file_path='2.thinlto.bc')
144+
tempdir.create_file(file_path='3.thinlto.bc')
145+
outdir = self.create_tempdir()
146+
obj = extract_ir.load_for_lld_thinlto(outer.full_path, outdir.full_path)
147+
self.assertLen(obj, 3)
148+
for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
149+
self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
150+
self.assertEqual(o._obj_base_dir, outer.full_path)
151+
self.assertEqual(o._output_base_dir, outdir.full_path)
152+
153+
def test_lld_thinlto_extraction(self):
154+
outer = self.create_tempdir()
155+
tempdir = outer.mkdir(dir_path='nest')
156+
tempdir.create_file(file_path='1.3.import.bc')
157+
tempdir.create_file(file_path='2.3.import.bc')
158+
tempdir.create_file(file_path='3.3.import.bc')
159+
tempdir.create_file(file_path='1.thinlto.bc')
160+
tempdir.create_file(file_path='2.thinlto.bc')
161+
tempdir.create_file(file_path='3.thinlto.bc')
162+
outdir = self.create_tempdir()
163+
obj = extract_ir.load_for_lld_thinlto(outer.full_path, outdir.full_path)
164+
for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
165+
mod_path = o.extract(thinlto_build='local')
166+
self.assertEqual(mod_path, f'nest/{i + 1:d}')
167+
self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
168+
self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
169+
self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
170+
self.assertTrue(
171+
os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
172+
self.assertTrue(
173+
os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
174+
self.assertTrue(
175+
os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
176+
119177
def test_filtering(self):
120178
cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
121179
self.assertTrue(extract_ir.should_include_module(cmdline, None))

0 commit comments

Comments
 (0)