Skip to content

Commit ee72352

Browse files
committed
Add wrapper instead of moving
1 parent 497d8f7 commit ee72352

File tree

6 files changed

+310
-283
lines changed

6 files changed

+310
-283
lines changed

llvm/utils/mlgo-utils/combine_training_corpus.py

100644 → 100755
Lines changed: 7 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,9 @@
1-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2-
# See https://llvm.org/LICENSE.txt for license information.
3-
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4-
r"""Combine multiple training corpus into a single training corpus.
1+
#!/usr/bin/env python3
52

6-
Currently only support the case that multiple corpus share the same
7-
configurables except the "modules" field.
3+
import re
4+
import sys
5+
from mlgo.corpus.combine_training_corpus import parse_args_and_run
6+
if __name__ == '__main__':
7+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8+
sys.exit(parse_args_and_run())
89

9-
Usage: we'd like to combine training corpus corpus1 and corpus2 into
10-
combinedcorpus; we first structure the files as follows:
11-
12-
combinedcorpus
13-
combinedcorpus/corpus1
14-
combinedcorpus/corpus2
15-
16-
Running this script with
17-
18-
python3 \
19-
compiler_opt/tools/combine_training_corpus.py \
20-
--root_dir=$PATH_TO_combinedcorpus
21-
22-
generates combinedcorpus/corpus_description.json file. In this way corpus1
23-
and corpus2 are combined into combinedcorpus.
24-
"""
25-
26-
import argparse
27-
import logging
28-
29-
from mlgo.corpus import combine_training_corpus_lib
30-
from mlgo.corpus import flags
31-
32-
33-
def parse_args_and_run():
34-
parser = argparse.ArgumentParser(
35-
description="A tool for combining multiple training corpora"
36-
)
37-
parser.add_argument(
38-
"--root_dir", type=str, help="The root dir of module paths to combine."
39-
)
40-
flags.add_verbosity_arguments(parser)
41-
args = parser.parse_args()
42-
main(args)
43-
44-
45-
def main(args):
46-
logging.basicConfig(level=args.verbosity)
47-
48-
combine_training_corpus_lib.combine_corpus(args.root_dir)
49-
50-
51-
if __name__ == "__main__":
52-
parse_args_and_run()

llvm/utils/mlgo-utils/extract_ir.py

100644 → 100755
Lines changed: 7 additions & 182 deletions
Original file line numberDiff line numberDiff line change
@@ -1,184 +1,9 @@
1-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2-
# See https://llvm.org/LICENSE.txt for license information.
3-
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4-
"""Extract IR for training.
1+
#!/usr/bin/env python3
52

6-
Extract IR for training, either from a compile_commands.json file produced by
7-
cmake, or a linker parameter list file.
3+
import re
4+
import sys
5+
from mlgo.corpus.extract_ir import parse_args_and_run
6+
if __name__ == '__main__':
7+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8+
sys.exit(parse_args_and_run())
89

9-
Only run with
10-
'python compiler_opt/tools/extract_ir.py ...'
11-
12-
The compilation is assumed to have been performed with clang, using
13-
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
14-
15-
In a distributed ThinLTO case, the compilation is assumed to have been performed
16-
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
17-
18-
In a local ThinLTO case, the compilation is assumed to have been performed
19-
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
20-
21-
To change the logging verbosity, set the --verbosity flag to the desired level.
22-
Setting it to a specific level will enable all messages at that level and
23-
higher. Exact values can be found by invoking the script with --help.
24-
"""
25-
26-
import argparse
27-
import json
28-
import logging
29-
30-
from mlgo.corpus import extract_ir_lib
31-
from mlgo.corpus import flags
32-
33-
34-
def parse_args_and_run():
35-
parser = argparse.ArgumentParser(
36-
description="A tool for making a corpus from build artifacts"
37-
)
38-
parser.add_argument(
39-
"--input",
40-
type=str,
41-
help="Input file or directory - either compile_commands.json, a linker "
42-
"parameter list, or a path to a directory containing object files.",
43-
)
44-
parser.add_argument(
45-
"--input_type",
46-
type=str,
47-
help="Input file type - JSON, LLD params, directory, or bazel aquery.",
48-
choices=["json", "params", "directory", "bazel_aquery"],
49-
default="json",
50-
nargs="?",
51-
)
52-
parser.add_argument("--output_dir", type=str, help="Output directory")
53-
parser.add_argument(
54-
"--num_workers",
55-
type=int,
56-
help="Number of parallel works for objcopy. `None` for maximum available.",
57-
default=None,
58-
nargs="?",
59-
)
60-
parser.add_argument(
61-
"--llvm_objcopy_path",
62-
type=str,
63-
help="Path to llvm-objcopy",
64-
default="llvm-objcopy",
65-
nargs="?",
66-
)
67-
parser.add_argument(
68-
"--obj_base_dir",
69-
type=str,
70-
help="Base directory for object files. Defaults to current working dir.",
71-
default="",
72-
nargs="?",
73-
)
74-
parser.add_argument(
75-
"--cmd_filter",
76-
type=str,
77-
help="Include only those modules with a command line matching this regular "
78-
"expression. Set it to None to not perform any filtering. Note that the "
79-
"regular expression is applied independently for each separate command line "
80-
"option. For example, ^-Oz$ will match Oz built binaries. This does not work "
81-
"with thinlto_build=lld.",
82-
default=None,
83-
nargs="?",
84-
)
85-
parser.add_argument(
86-
"--thinlto_build",
87-
type=str,
88-
help="Set if the build was performed with either 'distributed' or 'local' "
89-
"ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
90-
"assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
91-
"the distributed case or -Wl,--save-temps=import and "
92-
"-Wl,--thinlto-emit-index-files passed in the local case",
93-
choices=["distributed", "local"],
94-
default=None,
95-
nargs="?",
96-
)
97-
parser.add_argument(
98-
"--cmd_section_name",
99-
type=str,
100-
help="The section name passed to llvm-objcopy. For ELF object files, the "
101-
"default .llvmcmd is correct. For Mach-O object files, one should use "
102-
"something like __LLVM,__cmdline",
103-
default=".llvmcmd",
104-
nargs="?",
105-
)
106-
parser.add_argument(
107-
"--bitcode_section_name",
108-
type=str,
109-
help="The section name passed to llvm-objcopy. For ELF object files, the "
110-
"default .llvmbc is correct. For Mach-O object files, one should use "
111-
"__LLVM,__bitcode",
112-
default=".llvmbc",
113-
nargs="?",
114-
)
115-
flags.add_verbosity_arguments(parser)
116-
args = parser.parse_args()
117-
main(args)
118-
119-
120-
def main(args):
121-
logging.basicConfig(level=args.verbosity)
122-
123-
objs = []
124-
if args.input is not None and args.thinlto_build == "local":
125-
raise ValueError("--thinlto_build=local cannot be run with --input")
126-
if args.input is None:
127-
if args.thinlto_build != "local":
128-
raise ValueError("--input or --thinlto_build=local must be provided")
129-
objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
130-
elif args.input_type == "json":
131-
with open(args.input, encoding="utf-8") as f:
132-
objs = extract_ir_lib.load_from_compile_commands(
133-
json.load(f), args.output_dir
134-
)
135-
elif args.input_type == "params":
136-
if not args.obj_base_dir:
137-
logging.info(
138-
"-obj_base_dir is unspecified, assuming current directory. "
139-
"If no objects are found, use this option to specify the root "
140-
"directory for the object file paths in the input file."
141-
)
142-
with open(args.input, encoding="utf-8") as f:
143-
objs = extract_ir_lib.load_from_lld_params(
144-
[l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
145-
)
146-
elif args.input_type == "directory":
147-
logging.warning(
148-
"Using the directory input is only recommended if the build system "
149-
"your project uses does not support any structured output that "
150-
"ml-compiler-opt understands. If your build system provides a "
151-
"structured compilation database, use that instead"
152-
)
153-
objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
154-
elif args.input_type == "bazel_aquery":
155-
with open(args.input, encoding="utf-8") as aquery_json_handle:
156-
objs = extract_ir_lib.load_bazel_aquery(
157-
json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
158-
)
159-
else:
160-
logging.error("Unknown input type: %s", args.input_type)
161-
162-
relative_output_paths = extract_ir_lib.run_extraction(
163-
objs,
164-
args.num_workers,
165-
args.llvm_objcopy_path,
166-
args.cmd_filter,
167-
args.thinlto_build,
168-
args.cmd_section_name,
169-
args.bitcode_section_name,
170-
)
171-
172-
extract_ir_lib.write_corpus_manifest(
173-
args.thinlto_build, relative_output_paths, args.output_dir
174-
)
175-
176-
logging.info(
177-
"Converted %d files out of %d",
178-
len(objs) - relative_output_paths.count(None),
179-
len(objs),
180-
)
181-
182-
183-
if __name__ == "__main__":
184-
parse_args_and_run()

llvm/utils/mlgo-utils/make_corpus.py

100644 → 100755
Lines changed: 7 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,9 @@
1-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2-
# See https://llvm.org/LICENSE.txt for license information.
3-
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4-
"""Tool for making a corpus from arbitrary bitcode.
1+
#!/usr/bin/env python3
52

6-
To create a corpus from a set of bitcode files in an input directory, run
7-
the following command:
3+
import re
4+
import sys
5+
from mlgo.corpus.make_corpus import parse_args_and_run
6+
if __name__ == '__main__':
7+
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
8+
sys.exit(parse_args_and_run())
89

9-
PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
10-
--input_dir=<path to input directory> \
11-
--output_dir=<path to output directory> \
12-
--default_args="<list of space separated flags>"
13-
"""
14-
15-
import argparse
16-
import logging
17-
18-
from mlgo.corpus import make_corpus_lib
19-
20-
21-
def parse_args_and_run():
22-
parser = argparse.ArgumentParser(
23-
description="A tool for making a corpus from arbitrary bitcode"
24-
)
25-
parser.add_argument("--input_dir", type=str, help="The input directory.")
26-
parser.add_argument("--output_dir", type=str, help="The output directory.")
27-
parser.add_argument(
28-
"--default_args",
29-
type=str,
30-
help="The compiler flags to compile with when using downstream tooling.",
31-
default="",
32-
nargs="?",
33-
)
34-
args = parser.parse_args()
35-
main(args)
36-
37-
38-
def main(args):
39-
logging.warning(
40-
"Using this tool does not guarantee that the bitcode is taken at "
41-
"the correct stage for consumption during model training. Make "
42-
"sure to validate assumptions about where the bitcode is coming "
43-
"from before using it in production."
44-
)
45-
relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
46-
make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
47-
make_corpus_lib.write_corpus_manifest(
48-
relative_paths, args.output_dir, args.default_args.split()
49-
)
50-
51-
52-
if __name__ == "__main__":
53-
parse_args_and_run()
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
r"""Combine multiple training corpora into a single training corpus.
5+
6+
Currently only supports the case where multiple corpora share the same
7+
configurables except the "modules" field.
8+
9+
Usage: we'd like to combine training corpora corpus1 and corpus2 into
10+
combinedcorpus; we first structure the files as follows:
11+
12+
combinedcorpus
13+
combinedcorpus/corpus1
14+
combinedcorpus/corpus2
15+
16+
Running this script with
17+
18+
python3 \
19+
compiler_opt/tools/combine_training_corpus.py \
20+
--root_dir=$PATH_TO_combinedcorpus
21+
22+
generates combinedcorpus/corpus_description.json file. In this way corpus1
23+
and corpus2 are combined into combinedcorpus.
24+
"""
25+
26+
import argparse
27+
import logging
28+
29+
from mlgo.corpus import combine_training_corpus_lib
30+
from mlgo.corpus import flags
31+
32+
33+
def parse_args_and_run():
34+
parser = argparse.ArgumentParser(
35+
description="A tool for combining multiple training corpora"
36+
)
37+
parser.add_argument(
38+
"--root_dir", type=str, help="The root dir of module paths to combine."
39+
)
40+
flags.add_verbosity_arguments(parser)
41+
args = parser.parse_args()
42+
main(args)
43+
44+
45+
def main(args):
46+
logging.basicConfig(level=args.verbosity)
47+
48+
combine_training_corpus_lib.combine_corpus(args.root_dir)
49+
50+
51+
if __name__ == "__main__":
52+
parse_args_and_run()

0 commit comments

Comments
 (0)