Skip to content

Commit 4346e17

Browse files
Add make_corpus utility
This patch adds in a make_corpus.py script (along with the associated library and unit test) that allows for the creation of a corpus from a directory containing LLVM bitcode files.
1 parent 22c7154 commit 4346e17

File tree

3 files changed

+206
-0
lines changed

3 files changed

+206
-0
lines changed

compiler_opt/tools/make_corpus.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# coding=utf-8
2+
# Copyright 2020 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Tool for making a corpus from arbitrary bitcode.
16+
17+
To create a corpus from a set of bitcode files in an input directory, run
18+
the following command:
19+
20+
PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
21+
--input_dir=<path to input directory> \
22+
--output_dir=<path to output directory> \
23+
--default_flags="<list of space separated flags>"
24+
"""
25+
26+
from absl import app
27+
from absl import flags
28+
from absl import logging
29+
30+
from compiler_opt.tools import make_corpus_lib
31+
32+
# Command line interface of the tool. The two directory flags have no usable
# default, so they are registered with None and marked as required below.
flags.DEFINE_string('input_dir', None, 'The input directory.')
flags.DEFINE_string('output_dir', None, 'The output directory.')
flags.DEFINE_string(
    'default_flags', '',
    'The compiler flags to compile with when using downstream tooling.')

flags.mark_flag_as_required('input_dir')
flags.mark_flag_as_required('output_dir')

FLAGS = flags.FLAGS
42+
43+
44+
def main(_):
  """Builds a corpus in --output_dir from the bitcode under --input_dir.

  Looks up the bitcode files below FLAGS.input_dir, copies them into
  FLAGS.output_dir and writes a corpus manifest there. The manifest records
  FLAGS.default_flags split on whitespace.

  Args:
    _: Positional command line arguments handed over by app.run; unused.
  """
  # Adjacent string literals are concatenated verbatim, so every fragment
  # has to carry its own trailing space or the words at the seams run
  # together in the emitted message.
  logging.warning(
      'Using this tool does not guarantee that the bitcode is taken at '
      'the correct stage for consumption during model training. Make '
      'sure to validate assumptions about where the bitcode is coming '
      'from before using it in production.')
  relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
  make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
                               FLAGS.output_dir)
  make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
                                        FLAGS.default_flags.split())
54+
55+
56+
# Let absl parse the command line flags before handing control to main().
if __name__ == '__main__':
  app.run(main)

compiler_opt/tools/make_corpus_lib.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# coding=utf-8
2+
# Copyright 2020 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Library functions for making a corpus from arbitrary bitcode."""
16+
17+
import pathlib
18+
import os
19+
import shutil
20+
import json
21+
22+
from typing import List, Optional
23+
24+
25+
def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
  """Finds bitcode files to extract from a given directory.

  The directory is searched recursively for files with the .bc extension.

  Args:
    bitcode_base_dir: The base directory where the bitcode to be copied
      is from.

  Returns:
    A sorted list of paths to the bitcode files that were found, each one
    relative to the base directory.
  """
  # The glob pattern also matches directories whose name ends in .bc. Those
  # are not bitcode modules and cannot be copied as files, so only regular
  # files are kept.
  bitcode_files = (
      path for path in pathlib.Path(bitcode_base_dir).glob('**/*.bc')
      if path.is_file())

  # glob yields entries in an arbitrary, filesystem dependent order; sorting
  # makes the result (and anything derived from it) reproducible.
  return sorted(
      os.path.relpath(str(path), start=bitcode_base_dir)
      for path in bitcode_files)
41+
42+
43+
def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
                 output_dir: str):
  """Copies bitcode files from the base directory to the output directory.

  Missing directories below the output directory are created as needed.

  Args:
    relative_paths: An array of relative paths to bitcode files that are copied
      over to the output directory, preserving relative location.
    bitcode_base_dir: The base directory where the bitcode is located.
    output_dir: The output directory to place the bitcode in.
  """
  for relative_path in relative_paths:
    base_path = os.path.join(bitcode_base_dir, relative_path)
    destination_path = os.path.join(output_dir, relative_path)
    destination_dir = os.path.dirname(destination_path)
    # The directory part is empty when the destination is a bare file name
    # (empty output_dir and a top level file). os.makedirs('') raises
    # FileNotFoundError, and there is nothing to create in that case anyway.
    if destination_dir:
      os.makedirs(destination_dir, exist_ok=True)
    shutil.copy(base_path, destination_path)
58+
59+
60+
def write_corpus_manifest(relative_output_paths: List[str],
                          output_dir: str,
                          default_flags: Optional[List[str]] = None):
  """Creates a corpus manifest describing the bitcode that has been found.

  The manifest is written as JSON to corpus_description.json inside the
  output directory, which is created if it does not exist yet.

  Args:
    relative_output_paths: A list of paths to each bitcode file relative to the
      output directory. Entries that are None are left out of the manifest.
    output_dir: The output directory where the corpus is being created.
    default_flags: An array of compiler flags that should be used to compile
      the bitcode when using further downstream tooling. Defaults to no flags.
  """
  if default_flags is None:
    default_flags = []
  corpus_description = {
      'global_command_override': default_flags,
      'has_thinlto': False,
      'modules': [path for path in relative_output_paths if path is not None]
  }

  # Nothing else guarantees that the output directory exists (for example when
  # no bitcode was copied), so create it here. An empty output_dir means the
  # current directory, which os.makedirs cannot be asked to create.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  with open(
      os.path.join(output_dir, 'corpus_description.json'),
      'w',
      encoding='utf-8') as description_file:
    json.dump(corpus_description, description_file, indent=2)
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# coding=utf-8
2+
# Copyright 2020 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Test for compiler_opt.tools.make_corpus_lib"""
16+
17+
import json
18+
import os
19+
20+
from absl.testing import absltest
21+
22+
from compiler_opt.tools import make_corpus_lib
23+
24+
25+
class MakeCorpusTest(absltest.TestCase):
  """Unit tests for the corpus creation helpers in make_corpus_lib."""

  def test_load_bitcode_from_directory(self):
    """Bitcode in a nested directory is reported relative to the base."""
    outer = self.create_tempdir()
    tempdir = outer.mkdir(dir_path='nested')
    tempdir.create_file('test1.bc')
    tempdir.create_file('test2.bc')
    relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
    # Compare the complete sorted list so that unexpected extra entries fail
    # the test; build the expectation with os.path.join because the library
    # returns paths using the platform separator.
    self.assertEqual(
        sorted(relative_paths),
        [os.path.join('nested', 'test1.bc'),
         os.path.join('nested', 'test2.bc')])

  def test_copy_bitcode(self):
    """Copied files keep their location relative to the base directory."""
    build_dir = self.create_tempdir()
    nested_dir = build_dir.mkdir(dir_path='nested')
    nested_dir.create_file('test1.bc')
    nested_dir.create_file('test2.bc')
    relative_paths = [
        os.path.join('nested', 'test1.bc'),
        os.path.join('nested', 'test2.bc')
    ]
    corpus_dir = self.create_tempdir()
    make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
    output_files = sorted(os.listdir(os.path.join(corpus_dir, 'nested')))
    self.assertEqual(output_files, ['test1.bc', 'test2.bc'])

  def test_write_corpus_manifest(self):
    """The manifest records the flags and modules it was given."""
    relative_output_paths = ['test/test1.bc', 'test/test2.bc']
    output_dir = self.create_tempdir()
    default_flags = ['-O3', '-c']
    make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
                                          default_flags)
    with open(
        os.path.join(output_dir, 'corpus_description.json'),
        encoding='utf-8') as corpus_description_file:
      corpus_description = json.load(corpus_description_file)
    self.assertEqual(corpus_description['global_command_override'],
                     default_flags)
    # assertIs pins the JSON boolean itself rather than any falsy value.
    self.assertIs(corpus_description['has_thinlto'], False)
    self.assertEqual(corpus_description['modules'], relative_output_paths)
63+
64+
65+
# Run every test case in this module when executed as a script.
if __name__ == '__main__':
  absltest.main()

0 commit comments

Comments
 (0)