Skip to content

Commit dafc347

Browse files
Refactor functionality of extract_ir.py into a library (#267)
This commit refactors most of the functional elements of extract_ir.py into a library. This serves a couple of purposes. First, it makes parts of the code base a bit cleaner (no more hacky absl flag tricks to get things working in unit tests) and arguably slightly more readable. Second, it enables these functions to be used in other areas, particularly downstream projects within the compiler_opt Python package.
1 parent f45ed53 commit dafc347

File tree

3 files changed

+352
-294
lines changed

3 files changed

+352
-294
lines changed

compiler_opt/tools/extract_ir.py

Lines changed: 15 additions & 272 deletions
Original file line number | Diff line number | Diff line change
@@ -32,19 +32,12 @@
3232

3333
import json
3434
import multiprocessing
35-
import os
36-
import pathlib
37-
import re
38-
import shutil
39-
import subprocess
40-
41-
from typing import Dict, List, Optional
4235

4336
from absl import app
4437
from absl import flags
4538
from absl import logging
4639

47-
from compiler_opt.rl import constant
40+
from compiler_opt.tools import extract_ir_lib
4841

4942
flags.DEFINE_string(
5043
'input', None,
@@ -91,241 +84,6 @@
9184
FLAGS = flags.FLAGS
9285

9386

94-
# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
95-
# \0 - separated list of strings, to a \n one.
96-
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
97-
"""Determine if the module should be included."""
98-
if match_regexp is None:
99-
return True
100-
lines = cmdline.split('\0')
101-
return any(len(re.findall(match_regexp, l)) for l in lines)
102-
103-
104-
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
105-
opts = cmdline.split('\0')
106-
for option in opts:
107-
if option.startswith('-fthinlto-index'):
108-
return os.path.join(basedir, option.split('=')[1])
109-
return None
110-
111-
112-
class TrainingIRExtractor:
113-
"""IR and command line extraction from an object file."""
114-
115-
def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
116-
"""Set up a TrainingIRExtractor.
117-
118-
Args:
119-
obj_relative_path: relative path to the input object file. It will be also
120-
used to construct the absolute path of the output IR and cmd files, by
121-
appending it to output_base_dir.
122-
output_base_dir: the directory under which the output will be produced.
123-
obj_base_dir: the base directory for all the input object files.
124-
"""
125-
self._obj_relative_path = obj_relative_path
126-
self._output_base_dir = output_base_dir
127-
self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
128-
129-
def obj_base_dir(self):
130-
return self._obj_base_dir
131-
132-
def output_base_dir(self):
133-
return self._output_base_dir
134-
135-
def relative_output_path(self):
136-
return self._obj_relative_path
137-
138-
def input_obj(self):
139-
return os.path.join(self.obj_base_dir(), self._obj_relative_path)
140-
141-
def lld_src_bc(self):
142-
# .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
143-
# IR bitcode saved by lld. It is hardcoded into lld.
144-
return os.path.join(self._obj_base_dir,
145-
self._obj_relative_path + '.3.import.bc')
146-
147-
def lld_src_thinlto(self):
148-
return os.path.join(self._obj_base_dir,
149-
self._obj_relative_path + '.thinlto.bc')
150-
151-
def dest_dir(self):
152-
return os.path.join(self.output_base_dir(),
153-
os.path.dirname(self._obj_relative_path))
154-
155-
def module_name(self):
156-
return os.path.basename(self._obj_relative_path)
157-
158-
def cmd_file(self):
159-
return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
160-
161-
def bc_file(self):
162-
return os.path.join(self.dest_dir(), self.module_name() + '.bc')
163-
164-
def thinlto_index_file(self):
165-
return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
166-
167-
def _get_extraction_cmd_command(self, llvm_objcopy_path):
168-
"""Call llvm_objcopy to extract the llvmcmd section in self._cmd_file."""
169-
return [
170-
llvm_objcopy_path,
171-
'--dump-section=' + FLAGS.cmd_section_name + '=' + self.cmd_file(),
172-
self.input_obj(), '/dev/null'
173-
]
174-
175-
def _get_extraction_bc_command(self, llvm_objcopy_path):
176-
"""Call llvm_objcopy to extract the llvmbc section in self._bc_file."""
177-
return [
178-
llvm_objcopy_path,
179-
'--dump-section=' + FLAGS.bitcode_section_name + '=' + self.bc_file(),
180-
self.input_obj(), '/dev/null'
181-
]
182-
183-
def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
184-
is_thinlto: bool) -> Optional[str]:
185-
"""Run llvm-objcopy to extract the .bc and command line."""
186-
if not os.path.exists(self.input_obj()):
187-
logging.info('%s does not exist.', self.input_obj())
188-
return None
189-
os.makedirs(self.dest_dir(), exist_ok=True)
190-
try:
191-
subprocess.run(
192-
self._get_extraction_cmd_command(llvm_objcopy_path), check=True)
193-
if cmd_filter is not None or is_thinlto:
194-
with open(self.cmd_file(), encoding='utf-8') as f:
195-
lines = f.readlines()
196-
assert len(lines) == 1
197-
cmdline = lines[0]
198-
if not should_include_module(cmdline, cmd_filter):
199-
logging.info(
200-
'Excluding module %s because it does not match the filter',
201-
self.input_obj())
202-
os.remove(self.cmd_file())
203-
return None
204-
if is_thinlto:
205-
index_file = get_thinlto_index(cmdline, self.obj_base_dir())
206-
shutil.copy(index_file, self.thinlto_index_file())
207-
208-
subprocess.run(
209-
self._get_extraction_bc_command(llvm_objcopy_path), check=True)
210-
except subprocess.CalledProcessError as e:
211-
# This may happen if .o file was build from asm (.S source).
212-
logging.warning('%s was not processed: %s', self.input_obj(), e)
213-
return None
214-
assert (os.path.exists(self.cmd_file()) and
215-
os.path.exists(self.bc_file()) and
216-
(not is_thinlto or os.path.exists(self.thinlto_index_file())))
217-
return self.relative_output_path()
218-
219-
def _extract_lld_artifacts(self) -> Optional[str]:
220-
"""Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
221-
"""
222-
if not os.path.exists(self.lld_src_bc()):
223-
logging.info('%s does not exist.', self.lld_src_bc())
224-
return None
225-
if not os.path.exists(self.lld_src_thinlto()):
226-
logging.info('%s does not exist.', self.lld_src_thinlto())
227-
return None
228-
os.makedirs(self.dest_dir(), exist_ok=True)
229-
230-
# Copy over the files
231-
shutil.copy(self.lld_src_bc(), self.bc_file())
232-
shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
233-
234-
assert os.path.exists(self.bc_file())
235-
assert os.path.exists(self.thinlto_index_file())
236-
return self._obj_relative_path
237-
238-
def extract(self,
239-
llvm_objcopy_path: Optional[str] = None,
240-
cmd_filter: Optional[str] = None,
241-
thinlto_build: Optional[str] = None) -> Optional[str]:
242-
if thinlto_build == 'local':
243-
return self._extract_lld_artifacts()
244-
return self._extract_clang_artifacts(
245-
llvm_objcopy_path=llvm_objcopy_path,
246-
cmd_filter=cmd_filter,
247-
is_thinlto=thinlto_build == 'distributed')
248-
249-
250-
def convert_compile_command_to_objectfile(
251-
command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
252-
obj_base_dir = command['directory']
253-
cmd = command['command']
254-
255-
cmd_parts = cmd.split()
256-
try:
257-
obj_index = cmd_parts.index('-o') + 1
258-
except ValueError:
259-
# This could happen if there are non-clang commands in compile_commands.json
260-
logging.info('Command has no -o option: %s', cmd)
261-
return None
262-
obj_rel_path = cmd_parts[obj_index]
263-
# TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
264-
return TrainingIRExtractor(
265-
obj_relative_path=obj_rel_path,
266-
output_base_dir=output_dir,
267-
obj_base_dir=obj_base_dir)
268-
269-
270-
def load_from_compile_commands(json_array: List[Dict[str, str]],
271-
output_dir: str) -> List[TrainingIRExtractor]:
272-
objs = [
273-
convert_compile_command_to_objectfile(cmd, output_dir)
274-
for cmd in json_array
275-
]
276-
# Filter out None, in case there were non-clang commands in the .json
277-
return [obj for obj in objs if obj is not None]
278-
279-
280-
def load_from_lld_params(params_array: List[str], obj_base_dir: str,
281-
output_dir: str) -> List[TrainingIRExtractor]:
282-
"""Create an ObjectFile array based on lld's parameters."""
283-
# yank out -o and the output. After that, anything not starting with '-', and
284-
# ending in a '.o', is an object file.
285-
try:
286-
minus_o_idx = params_array.index('-o')
287-
del params_array[minus_o_idx:minus_o_idx + 2]
288-
just_obj_paths = [
289-
o for o in params_array if not o.startswith('-') and o.endswith('.o')
290-
]
291-
except ValueError:
292-
logging.info('This params file does not have an explicit -o option.')
293-
just_obj_paths = params_array
294-
295-
def make_obj(obj_file: str) -> TrainingIRExtractor:
296-
return TrainingIRExtractor(
297-
obj_relative_path=obj_file,
298-
output_base_dir=output_dir,
299-
obj_base_dir=obj_base_dir)
300-
301-
return [make_obj(obj_file) for obj_file in just_obj_paths]
302-
303-
304-
def load_for_lld_thinlto(obj_base_dir: str,
305-
output_dir: str) -> List[TrainingIRExtractor]:
306-
# .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
307-
# IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
308-
# are also emitted next to the postimport bitcode, with the suffix
309-
# .thinlto.bc instead
310-
paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
311-
312-
def make_spec(obj_file: str):
313-
return TrainingIRExtractor(
314-
# Cut away .3.import.bc
315-
obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
316-
output_base_dir=output_dir,
317-
obj_base_dir=obj_base_dir)
318-
319-
return [make_spec(path) for path in paths]
320-
321-
322-
# This is here just for readability, lint complains if the pooling expression is
323-
# over 3 lines; and it needs to be a non-local so it may be pickled.
324-
def extract_artifacts(obj: TrainingIRExtractor) -> Optional[str]:
325-
return obj.extract(FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
326-
FLAGS.thinlto_build)
327-
328-
32987
def main(argv):
33088
if len(argv) > 1:
33189
raise app.UsageError('Too many command-line arguments.')
@@ -336,49 +94,34 @@ def main(argv):
33694
if FLAGS.input is None:
33795
if FLAGS.thinlto_build != 'local':
33896
raise ValueError('--input or --thinlto_build=local must be provided')
339-
objs = load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
97+
objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
98+
FLAGS.output_dir)
34099
elif FLAGS.input_type == 'json':
341100
with open(FLAGS.input, encoding='utf-8') as f:
342-
objs = load_from_compile_commands(json.load(f), FLAGS.output_dir)
101+
objs = extract_ir_lib.load_from_compile_commands(
102+
json.load(f), FLAGS.output_dir)
343103
elif FLAGS.input_type == 'params':
344104
if not FLAGS.obj_base_dir:
345105
logging.info(
346106
'-obj_base_dir is unspecified, assuming current directory.'
347107
'If no objects are found, use this option to specify the root'
348108
'directory for the object file paths in the input file.')
349109
with open(FLAGS.input, encoding='utf-8') as f:
350-
objs = load_from_lld_params([l.strip() for l in f.readlines()],
351-
FLAGS.obj_base_dir, FLAGS.output_dir)
110+
objs = extract_ir_lib.load_from_lld_params(
111+
[l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
112+
FLAGS.output_dir)
352113
else:
353114
logging.error('Unknown input type: %s', FLAGS.input_type)
354115

355-
with multiprocessing.Pool(FLAGS.num_workers) as pool:
356-
relative_output_paths = pool.map(extract_artifacts, objs)
357-
pool.close()
358-
pool.join()
359-
360-
# This comes first rather than later so global_command_override is at the top
361-
# of the .json after being written
362-
if FLAGS.thinlto_build == 'local':
363-
corpus_description = {
364-
'global_command_override': constant.UNSPECIFIED_OVERRIDE
365-
}
366-
else:
367-
corpus_description = {}
368-
369-
corpus_description.update({
370-
'has_thinlto': FLAGS.thinlto_build is not None,
371-
'modules': [path for path in relative_output_paths if path is not None]
372-
})
116+
relative_output_paths = extract_ir_lib.run_extraction(
117+
objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
118+
FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
373119

374-
with open(
375-
os.path.join(FLAGS.output_dir, 'corpus_description.json'),
376-
'w',
377-
encoding='utf-8') as f:
378-
json.dump(corpus_description, f, indent=2)
120+
extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
121+
relative_output_paths, FLAGS.output_dir)
379122

380-
logging.info('Converted %d files out of %d',
381-
len(objs) - relative_output_paths.count(None), len(objs))
123+
logging.info('Converted %d files out of %d',
124+
len(objs) - relative_output_paths.count(None), len(objs))
382125

383126

384127
if __name__ == '__main__':

0 commit comments

Comments
 (0)