|
1 | | -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
2 | | -# See https://llvm.org/LICENSE.txt for license information. |
3 | | -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
4 | | -"""Extract IR for training. |
| 1 | +#!/usr/bin/env python3 |
5 | 2 |
|
6 | | -Extract IR for training, either from a compile_commands.json file produced by |
7 | | -cmake, or a linker parameter list file. |
| 3 | +import re |
| 4 | +import sys |
| 5 | +from mlgo.corpus.extract_ir import parse_args_and_run |
| 6 | +if __name__ == '__main__': |
| 7 | + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) |
| 8 | + sys.exit(parse_args_and_run()) |
8 | 9 |
|
9 | | -Only run with |
10 | | -'python compiler_opt/tools/extract_ir.py ...' |
11 | | -
|
12 | | -The compilation is assumed to have been performed with clang, using |
13 | | --fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all) |
14 | | -
|
15 | | -In a distributed ThinLTO case, the compilation is assumed to have been performed |
16 | | -specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt. |
17 | | -
|
18 | | -In a local ThinLTO case, the compilation is assumed to have been performed |
19 | | -specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files |
20 | | -
|
21 | | -To change the logging verbosity, set the --verbosity flag to the desired level. |
22 | | -Setting it to a specific level will enable all messages at that level and |
23 | | -higher. Exact values can be found by invoking the script with --help. |
24 | | -""" |
25 | | - |
26 | | -import argparse |
27 | | -import json |
28 | | -import logging |
29 | | - |
30 | | -from mlgo.corpus import extract_ir_lib |
31 | | -from mlgo.corpus import flags |
32 | | - |
33 | | - |
34 | | -def parse_args_and_run(): |
35 | | - parser = argparse.ArgumentParser( |
36 | | - description="A tool for making a corpus from build artifacts" |
37 | | - ) |
38 | | - parser.add_argument( |
39 | | - "--input", |
40 | | - type=str, |
41 | | - help="Input file or directory - either compile_commands.json, a linker " |
42 | | - "parameter list, or a path to a directory containing object files.", |
43 | | - ) |
44 | | - parser.add_argument( |
45 | | - "--input_type", |
46 | | - type=str, |
47 | | - help="Input file type - JSON, LLD params, directory, or bazel aquery.", |
48 | | - choices=["json", "params", "directory", "bazel_aquery"], |
49 | | - default="json", |
50 | | - nargs="?", |
51 | | - ) |
52 | | - parser.add_argument("--output_dir", type=str, help="Output directory") |
53 | | - parser.add_argument( |
54 | | - "--num_workers", |
55 | | - type=int, |
56 | | - help="Number of parallel works for objcopy. `None` for maximum available.", |
57 | | - default=None, |
58 | | - nargs="?", |
59 | | - ) |
60 | | - parser.add_argument( |
61 | | - "--llvm_objcopy_path", |
62 | | - type=str, |
63 | | - help="Path to llvm-objcopy", |
64 | | - default="llvm-objcopy", |
65 | | - nargs="?", |
66 | | - ) |
67 | | - parser.add_argument( |
68 | | - "--obj_base_dir", |
69 | | - type=str, |
70 | | - help="Base directory for object files. Defaults to current working dir.", |
71 | | - default="", |
72 | | - nargs="?", |
73 | | - ) |
74 | | - parser.add_argument( |
75 | | - "--cmd_filter", |
76 | | - type=str, |
77 | | - help="Include only those modules with a command line matching this regular " |
78 | | - "expression. Set it to None to not perform any filtering. Note that the " |
79 | | - "regular expression is applied independently for each separate command line " |
80 | | - "option. For example, ^-Oz$ will match Oz built binaries. This does not work " |
81 | | - "with thinlto_build=lld.", |
82 | | - default=None, |
83 | | - nargs="?", |
84 | | - ) |
85 | | - parser.add_argument( |
86 | | - "--thinlto_build", |
87 | | - type=str, |
88 | | - help="Set if the build was performed with either 'distributed' or 'local' " |
89 | | - "ThinLTO. This ensures the thinlto.bc files are also copied. The build is " |
90 | | - "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " |
91 | | - "the distributed case or -Wl,--save-temps=import and " |
92 | | - "-Wl,--thinlto-emit-index-files passed in the local case", |
93 | | - choices=["distributed", "local"], |
94 | | - default=None, |
95 | | - nargs="?", |
96 | | - ) |
97 | | - parser.add_argument( |
98 | | - "--cmd_section_name", |
99 | | - type=str, |
100 | | - help="The section name passed to llvm-objcopy. For ELF object files, the " |
101 | | - "default .llvmcmd is correct. For Mach-O object files, one should use " |
102 | | - "something like __LLVM,__cmdline", |
103 | | - default=".llvmcmd", |
104 | | - nargs="?", |
105 | | - ) |
106 | | - parser.add_argument( |
107 | | - "--bitcode_section_name", |
108 | | - type=str, |
109 | | - help="The section name passed to llvm-objcopy. For ELF object files, the " |
110 | | - "default .llvmbc is correct. For Mach-O object files, one should use " |
111 | | - "__LLVM,__bitcode", |
112 | | - default=".llvmbc", |
113 | | - nargs="?", |
114 | | - ) |
115 | | - flags.add_verbosity_arguments(parser) |
116 | | - args = parser.parse_args() |
117 | | - main(args) |
118 | | - |
119 | | - |
120 | | -def main(args): |
121 | | - logging.basicConfig(level=args.verbosity) |
122 | | - |
123 | | - objs = [] |
124 | | - if args.input is not None and args.thinlto_build == "local": |
125 | | - raise ValueError("--thinlto_build=local cannot be run with --input") |
126 | | - if args.input is None: |
127 | | - if args.thinlto_build != "local": |
128 | | - raise ValueError("--input or --thinlto_build=local must be provided") |
129 | | - objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) |
130 | | - elif args.input_type == "json": |
131 | | - with open(args.input, encoding="utf-8") as f: |
132 | | - objs = extract_ir_lib.load_from_compile_commands( |
133 | | - json.load(f), args.output_dir |
134 | | - ) |
135 | | - elif args.input_type == "params": |
136 | | - if not args.obj_base_dir: |
137 | | - logging.info( |
138 | | - "-obj_base_dir is unspecified, assuming current directory. " |
139 | | - "If no objects are found, use this option to specify the root " |
140 | | - "directory for the object file paths in the input file." |
141 | | - ) |
142 | | - with open(args.input, encoding="utf-8") as f: |
143 | | - objs = extract_ir_lib.load_from_lld_params( |
144 | | - [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir |
145 | | - ) |
146 | | - elif args.input_type == "directory": |
147 | | - logging.warning( |
148 | | - "Using the directory input is only recommended if the build system " |
149 | | - "your project uses does not support any structured output that " |
150 | | - "ml-compiler-opt understands. If your build system provides a " |
151 | | - "structured compilation database, use that instead" |
152 | | - ) |
153 | | - objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) |
154 | | - elif args.input_type == "bazel_aquery": |
155 | | - with open(args.input, encoding="utf-8") as aquery_json_handle: |
156 | | - objs = extract_ir_lib.load_bazel_aquery( |
157 | | - json.load(aquery_json_handle), args.obj_base_dir, args.output_dir |
158 | | - ) |
159 | | - else: |
160 | | - logging.error("Unknown input type: %s", args.input_type) |
161 | | - |
162 | | - relative_output_paths = extract_ir_lib.run_extraction( |
163 | | - objs, |
164 | | - args.num_workers, |
165 | | - args.llvm_objcopy_path, |
166 | | - args.cmd_filter, |
167 | | - args.thinlto_build, |
168 | | - args.cmd_section_name, |
169 | | - args.bitcode_section_name, |
170 | | - ) |
171 | | - |
172 | | - extract_ir_lib.write_corpus_manifest( |
173 | | - args.thinlto_build, relative_output_paths, args.output_dir |
174 | | - ) |
175 | | - |
176 | | - logging.info( |
177 | | - "Converted %d files out of %d", |
178 | | - len(objs) - relative_output_paths.count(None), |
179 | | - len(objs), |
180 | | - ) |
181 | | - |
182 | | - |
183 | | -if __name__ == "__main__": |
184 | | - parse_args_and_run() |
0 commit comments