32
32
33
33
import json
34
34
import multiprocessing
35
- import os
36
- import pathlib
37
- import re
38
- import shutil
39
- import subprocess
40
-
41
- from typing import Dict , List , Optional
42
35
43
36
from absl import app
44
37
from absl import flags
45
38
from absl import logging
46
39
47
- from compiler_opt .rl import constant
40
+ from compiler_opt .tools import extract_ir_lib
48
41
49
42
flags .DEFINE_string (
50
43
'input' , None ,
91
84
FLAGS = flags .FLAGS
92
85
93
86
94
# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file, from
# a \0 - separated list of strings, to a \n one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
  """Determine if the module should be included.

  Args:
    cmdline: the module's command line, as a NUL-separated list of options.
    match_regexp: regular expression searched for in each option. None
      disables filtering.

  Returns:
    True if no filter was given, or if the expression matches somewhere in at
    least one of the options; False otherwise.
  """
  if match_regexp is None:
    return True
  # re.search stops at the first hit; there is no need to collect every match
  # just to learn whether one exists.
  return any(
      re.search(match_regexp, option) is not None
      for option in cmdline.split('\0'))
102
-
103
-
104
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
  """Locate the ThinLTO index file named on a command line.

  Args:
    cmdline: the command line, as a NUL-separated list of options.
    basedir: directory onto which the index path is joined.

  Returns:
    basedir joined with the value of the first '-fthinlto-index...=VALUE'
    option, or None if the command line has no such option.
  """
  for option in cmdline.split('\0'):
    if not option.startswith('-fthinlto-index'):
      continue
    # Split on the first '=' only, so a path that itself contains '=' is kept
    # whole. An option without any '=' carries no path and is skipped rather
    # than raising IndexError.
    _, separator, index_path = option.partition('=')
    if separator:
      return os.path.join(basedir, index_path)
  return None
110
-
111
-
112
class TrainingIRExtractor:
  """IR and command line extraction from an object file."""

  def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
    """Set up a TrainingIRExtractor.

    Args:
      obj_relative_path: relative path to the input object file. It will be also
        used to construct the absolute path of the output IR and cmd files, by
        appending it to output_base_dir.
      output_base_dir: the directory under which the output will be produced.
      obj_base_dir: the base directory for all the input object files.
    """
    self._obj_relative_path = obj_relative_path
    self._output_base_dir = output_base_dir
    # A missing base dir becomes '', so os.path.join leaves the relative path
    # untouched.
    self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''

  def obj_base_dir(self):
    """Returns the base directory of the input object files ('' if unset)."""
    return self._obj_base_dir

  def output_base_dir(self):
    """Returns the directory under which the outputs are produced."""
    return self._output_base_dir

  def relative_output_path(self):
    """Returns the object's relative path, which also names the outputs."""
    return self._obj_relative_path

  def input_obj(self):
    """Returns the path of the input object file."""
    return os.path.join(self.obj_base_dir(), self._obj_relative_path)

  def lld_src_bc(self):
    """Returns the path of the post-import bitcode saved by lld."""
    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
    # IR bitcode saved by lld. It is hardcoded into lld.
    return os.path.join(self._obj_base_dir,
                        self._obj_relative_path + '.3.import.bc')

  def lld_src_thinlto(self):
    """Returns the path of the '.thinlto.bc' file next to the input object."""
    return os.path.join(self._obj_base_dir,
                        self._obj_relative_path + '.thinlto.bc')

  def dest_dir(self):
    """Returns the output directory for this module's files."""
    return os.path.join(self.output_base_dir(),
                        os.path.dirname(self._obj_relative_path))

  def module_name(self):
    """Returns the base name of the object's relative path."""
    return os.path.basename(self._obj_relative_path)

  def cmd_file(self):
    """Returns the output path of the extracted command line (.cmd)."""
    return os.path.join(self.dest_dir(), self.module_name() + '.cmd')

  def bc_file(self):
    """Returns the output path of the extracted bitcode (.bc)."""
    return os.path.join(self.dest_dir(), self.module_name() + '.bc')

  def thinlto_index_file(self):
    """Returns the output path of the ThinLTO index (.thinlto.bc)."""
    return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')

  def _get_extraction_cmd_command(self, llvm_objcopy_path):
    """Call llvm_objcopy to extract the llvmcmd section in self._cmd_file."""
    return [
        llvm_objcopy_path,
        '--dump-section=' + FLAGS.cmd_section_name + '=' + self.cmd_file(),
        self.input_obj(), '/dev/null'
    ]

  def _get_extraction_bc_command(self, llvm_objcopy_path):
    """Call llvm_objcopy to extract the llvmbc section in self._bc_file."""
    return [
        llvm_objcopy_path,
        '--dump-section=' + FLAGS.bitcode_section_name + '=' + self.bc_file(),
        self.input_obj(), '/dev/null'
    ]

  def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
                               is_thinlto: bool) -> Optional[str]:
    """Run llvm-objcopy to extract the .bc and command line.

    Args:
      llvm_objcopy_path: path of the llvm-objcopy binary to invoke.
      cmd_filter: regular expression the command line must match for the
        module to be kept, or None to keep every module.
      is_thinlto: whether to also copy the ThinLTO index file named on the
        module's command line.

    Returns:
      The relative output path on success. None if the input object does not
      exist, the module is filtered out, a ThinLTO index was requested but the
      command line names none, or llvm-objcopy fails.
    """
    if not os.path.exists(self.input_obj()):
      logging.info('%s does not exist.', self.input_obj())
      return None
    os.makedirs(self.dest_dir(), exist_ok=True)
    try:
      subprocess.run(
          self._get_extraction_cmd_command(llvm_objcopy_path), check=True)
      if cmd_filter is not None or is_thinlto:
        with open(self.cmd_file(), encoding='utf-8') as f:
          lines = f.readlines()
        assert len(lines) == 1
        cmdline = lines[0]
        if not should_include_module(cmdline, cmd_filter):
          logging.info(
              'Excluding module %s because it does not match the filter',
              self.input_obj())
          os.remove(self.cmd_file())
          return None
        if is_thinlto:
          index_file = get_thinlto_index(cmdline, self.obj_base_dir())
          if index_file is None:
            # Without this check None would reach shutil.copy and raise a
            # TypeError that nothing here catches. Skip the module and drop
            # the already-extracted .cmd, as the filter branch does.
            logging.warning(
                '%s was not processed: no ThinLTO index in its command line',
                self.input_obj())
            os.remove(self.cmd_file())
            return None
          shutil.copy(index_file, self.thinlto_index_file())

      subprocess.run(
          self._get_extraction_bc_command(llvm_objcopy_path), check=True)
    except subprocess.CalledProcessError as e:
      # This may happen if .o file was build from asm (.S source).
      logging.warning('%s was not processed: %s', self.input_obj(), e)
      return None
    assert (os.path.exists(self.cmd_file()) and
            os.path.exists(self.bc_file()) and
            (not is_thinlto or os.path.exists(self.thinlto_index_file())))
    return self.relative_output_path()

  def _extract_lld_artifacts(self) -> Optional[str]:
    """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.

    Returns:
      The object's relative path once both files were copied, or None if
      either source file is missing.
    """
    if not os.path.exists(self.lld_src_bc()):
      logging.info('%s does not exist.', self.lld_src_bc())
      return None
    if not os.path.exists(self.lld_src_thinlto()):
      logging.info('%s does not exist.', self.lld_src_thinlto())
      return None
    os.makedirs(self.dest_dir(), exist_ok=True)

    # Copy over the files
    shutil.copy(self.lld_src_bc(), self.bc_file())
    shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())

    assert os.path.exists(self.bc_file())
    assert os.path.exists(self.thinlto_index_file())
    return self._obj_relative_path

  def extract(self,
              llvm_objcopy_path: Optional[str] = None,
              cmd_filter: Optional[str] = None,
              thinlto_build: Optional[str] = None) -> Optional[str]:
    """Extract this module's artifacts into the output directory.

    Args:
      llvm_objcopy_path: path of llvm-objcopy; not used when thinlto_build is
        'local'.
      cmd_filter: optional regular expression the command line must match; not
        used when thinlto_build is 'local'.
      thinlto_build: 'local' copies the files saved by lld; 'distributed'
        extracts from the object file and also copies the ThinLTO index; any
        other value extracts from the object file only.

    Returns:
      The module's relative output path, or None if nothing was extracted.
    """
    if thinlto_build == 'local':
      return self._extract_lld_artifacts()
    return self._extract_clang_artifacts(
        llvm_objcopy_path=llvm_objcopy_path,
        cmd_filter=cmd_filter,
        is_thinlto=thinlto_build == 'distributed')
248
-
249
-
250
def convert_compile_command_to_objectfile(
    command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
  """Build an extractor for the output named by one compile command.

  Args:
    command: one compile_commands.json entry; its 'directory' and 'command'
      keys are read.
    output_dir: the directory under which the extractor writes its output.

  Returns:
    A TrainingIRExtractor for the path following '-o', or None if the command
    has no '-o' option or '-o' is its last token.
  """
  obj_base_dir = command['directory']
  cmd = command['command']

  cmd_parts = cmd.split()
  try:
    obj_rel_path = cmd_parts[cmd_parts.index('-o') + 1]
  except ValueError:
    # This could happen if there are non-clang commands in compile_commands.json
    logging.info('Command has no -o option: %s', cmd)
    return None
  except IndexError:
    # '-o' was the last token, so there is no output path to read.
    logging.info('Command has -o but no output path: %s', cmd)
    return None
  # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
  return TrainingIRExtractor(
      obj_relative_path=obj_rel_path,
      output_base_dir=output_dir,
      obj_base_dir=obj_base_dir)
268
-
269
-
270
def load_from_compile_commands(json_array: List[Dict[str, str]],
                               output_dir: str) -> List[TrainingIRExtractor]:
  """Create extractors for the entries of a compile_commands.json array.

  Args:
    json_array: the compile command entries.
    output_dir: the directory under which the extractors write their output.

  Returns:
    One extractor per entry that could be converted, in input order.
  """
  extractors = []
  for entry in json_array:
    extractor = convert_compile_command_to_objectfile(entry, output_dir)
    # Conversion yields None for entries it cannot handle, e.g. non-clang
    # commands in the .json; those are left out.
    if extractor is not None:
      extractors.append(extractor)
  return extractors
278
-
279
-
280
def load_from_lld_params(params_array: List[str], obj_base_dir: str,
                         output_dir: str) -> List[TrainingIRExtractor]:
  """Create an ObjectFile array based on lld's parameters.

  Args:
    params_array: the lld parameters, one per element. It is not modified.
    obj_base_dir: the base directory of the object file paths.
    output_dir: the directory under which the extractors write their output.

  Returns:
    One extractor per object file found in the parameters.
  """
  try:
    minus_o_idx = params_array.index('-o')
  except ValueError:
    logging.info('This params file does not have an explicit -o option.')
    # Without -o every parameter is taken as an object path, unfiltered.
    just_obj_paths = params_array
  else:
    # Yank out -o and the output, working on a copy so the caller's list is
    # left intact. After that, anything not starting with '-', and ending in
    # a '.o', is an object file.
    remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2:]
    just_obj_paths = [
        o for o in remaining if not o.startswith('-') and o.endswith('.o')
    ]

  return [
      TrainingIRExtractor(
          obj_relative_path=obj_file,
          output_base_dir=output_dir,
          obj_base_dir=obj_base_dir) for obj_file in just_obj_paths
  ]
302
-
303
-
304
def load_for_lld_thinlto(obj_base_dir: str,
                         output_dir: str) -> List[TrainingIRExtractor]:
  """Create extractors for the post-import bitcode files under a directory.

  Args:
    obj_base_dir: the directory searched recursively for '*.3.import.bc'.
    output_dir: the directory under which the extractors write their output.

  Returns:
    One extractor per bitcode file found, with the suffix cut from its path
    relative to obj_base_dir.
  """
  # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
  # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
  # are also emitted next to the postimport bitcode, with the suffix
  # .thinlto.bc instead
  suffix_len = len('.3.import.bc')
  bitcode_paths = [
      str(found) for found in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')
  ]

  extractors = []
  for bitcode_path in bitcode_paths:
    relative = os.path.relpath(bitcode_path, start=obj_base_dir)
    extractors.append(
        TrainingIRExtractor(
            # Cut away .3.import.bc
            obj_relative_path=relative[:-suffix_len],
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir))
  return extractors
320
-
321
-
322
# Kept as a separate function for readability: lint complains if the pooling
# expression is over 3 lines. It must be a non-local so it may be pickled.
def extract_artifacts(obj: TrainingIRExtractor) -> Optional[str]:
  """Run extraction on obj using the flag-provided settings.

  Args:
    obj: the extractor to run.

  Returns:
    Whatever obj.extract returns: the relative output path, or None.
  """
  return obj.extract(
      llvm_objcopy_path=FLAGS.llvm_objcopy_path,
      cmd_filter=FLAGS.cmd_filter,
      thinlto_build=FLAGS.thinlto_build)
327
-
328
-
329
87
def main (argv ):
330
88
if len (argv ) > 1 :
331
89
raise app .UsageError ('Too many command-line arguments.' )
@@ -336,49 +94,34 @@ def main(argv):
336
94
if FLAGS .input is None :
337
95
if FLAGS .thinlto_build != 'local' :
338
96
raise ValueError ('--input or --thinlto_build=local must be provided' )
339
- objs = load_for_lld_thinlto (FLAGS .obj_base_dir , FLAGS .output_dir )
97
+ objs = extract_ir_lib .load_for_lld_thinlto (FLAGS .obj_base_dir ,
98
+ FLAGS .output_dir )
340
99
elif FLAGS .input_type == 'json' :
341
100
with open (FLAGS .input , encoding = 'utf-8' ) as f :
342
- objs = load_from_compile_commands (json .load (f ), FLAGS .output_dir )
101
+ objs = extract_ir_lib .load_from_compile_commands (
102
+ json .load (f ), FLAGS .output_dir )
343
103
elif FLAGS .input_type == 'params' :
344
104
if not FLAGS .obj_base_dir :
345
105
logging .info (
346
106
'-obj_base_dir is unspecified, assuming current directory.'
347
107
'If no objects are found, use this option to specify the root'
348
108
'directory for the object file paths in the input file.' )
349
109
with open (FLAGS .input , encoding = 'utf-8' ) as f :
350
- objs = load_from_lld_params ([l .strip () for l in f .readlines ()],
351
- FLAGS .obj_base_dir , FLAGS .output_dir )
110
+ objs = extract_ir_lib .load_from_lld_params (
111
+ [l .strip () for l in f .readlines ()], FLAGS .obj_base_dir ,
112
+ FLAGS .output_dir )
352
113
else :
353
114
logging .error ('Unknown input type: %s' , FLAGS .input_type )
354
115
355
- with multiprocessing .Pool (FLAGS .num_workers ) as pool :
356
- relative_output_paths = pool .map (extract_artifacts , objs )
357
- pool .close ()
358
- pool .join ()
359
-
360
- # This comes first rather than later so global_command_override is at the top
361
- # of the .json after being written
362
- if FLAGS .thinlto_build == 'local' :
363
- corpus_description = {
364
- 'global_command_override' : constant .UNSPECIFIED_OVERRIDE
365
- }
366
- else :
367
- corpus_description = {}
368
-
369
- corpus_description .update ({
370
- 'has_thinlto' : FLAGS .thinlto_build is not None ,
371
- 'modules' : [path for path in relative_output_paths if path is not None ]
372
- })
116
+ relative_output_paths = extract_ir_lib .run_extraction (
117
+ objs , FLAGS .num_workers , FLAGS .llvm_objcopy_path , FLAGS .cmd_filter ,
118
+ FLAGS .thinlto_build , FLAGS .cmd_section_name , FLAGS .bitcode_section_name )
373
119
374
- with open (
375
- os .path .join (FLAGS .output_dir , 'corpus_description.json' ),
376
- 'w' ,
377
- encoding = 'utf-8' ) as f :
378
- json .dump (corpus_description , f , indent = 2 )
120
+ extract_ir_lib .write_corpus_manifest (FLAGS .thinlto_build ,
121
+ relative_output_paths , FLAGS .output_dir )
379
122
380
- logging .info ('Converted %d files out of %d' ,
381
- len (objs ) - relative_output_paths .count (None ), len (objs ))
123
+ logging .info ('Converted %d files out of %d' ,
124
+ len (objs ) - relative_output_paths .count (None ), len (objs ))
382
125
383
126
384
127
if __name__ == '__main__' :
0 commit comments