@@ -105,7 +105,9 @@ def thinlto_index_file(self):
105
105
106
106
def _get_extraction_cmd_command (self , llvm_objcopy_path : str ,
107
107
cmd_section_name : str ):
108
- """Call llvm_objcopy to extract the llvmcmd section in self._cmd_file."""
108
+ """Get llvm-objcopy and process args to produce a command string that,
109
+ when invoked, will extract the cmd section into the self.cmd_file() file.
110
+ """
109
111
return [
110
112
llvm_objcopy_path ,
111
113
'--dump-section=' + cmd_section_name + '=' + self .cmd_file (),
@@ -114,7 +116,10 @@ def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
114
116
115
117
def _get_extraction_bc_command (self , llvm_objcopy_path : str ,
116
118
bitcode_section_name : str ):
117
- """Call llvm_objcopy to extract the llvmbc section in self._bc_file."""
119
+ """Gets llvm-objcopy and process args to produce a command string that,
120
+ when invoked, will extract the bitcode section into the self.bc_file()
121
+ file.
122
+ """
118
123
return [
119
124
llvm_objcopy_path ,
120
125
'--dump-section=' + bitcode_section_name + '=' + self .bc_file (),
@@ -252,6 +257,14 @@ def make_obj(obj_file: str) -> TrainingIRExtractor:
252
257
253
258
def load_from_directory (obj_base_dir : str ,
254
259
output_dir : str ) -> List [TrainingIRExtractor ]:
260
+ """Create an object file array by globbing an entire directory.
261
+
262
+ Args:
263
+ obj_base_dir: The base build directory that all object files will be
264
+ written out as being relative to.
265
+ output_dir: The output directory where extracted .bc and .cmd files should
266
+ be placed.
267
+ """
255
268
paths = [str (p ) for p in pathlib .Path (obj_base_dir ).glob ('**/*.o' )]
256
269
257
270
def make_spec (obj_file : str ):
@@ -292,6 +305,24 @@ def extract_artifacts(obj: TrainingIRExtractor, llvm_objcopy_path: str,
292
305
def run_extraction (objs : List [TrainingIRExtractor ], num_workers : int ,
293
306
llvm_objcopy_path : str , cmd_filter : str , thinlto_build : str ,
294
307
cmd_section_name : str , bitcode_section_name : str ):
308
+ """Extracts all specified object files into the corpus directory.
309
+
310
+ Args:
311
+ objs: A list of TrainingIRExtractor Objects that represent the object files
312
+ to extract bitcode/commands from.
313
+ num_workers: The number of parallel processes to spawn to run the
314
+ extraction.
315
+ llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
316
+ cmd_filter: A regular expression that is used to select for compilations
317
+ performed with specific flags. If you want to include all compilations,
318
+ set this to None.
319
+ thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
320
+ Set this to None if the build was not done with ThinLTO.
321
+ cmd_section_name: The name of the command line section created by the
322
+ bitcode embedding.
323
+ bitcode_section_name: The name of the bitcode section created by the
324
+ bitcode embedding.
325
+ """
295
326
extract_artifacts_function = functools .partial (
296
327
extract_artifacts ,
297
328
llvm_objcopy_path = llvm_objcopy_path ,
@@ -308,6 +339,18 @@ def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
308
339
309
340
def write_corpus_manifest (thinlto_build : str , relative_output_paths : List [str ],
310
341
output_dir : str ):
342
+ """Writes a corpus_manifest.json containing all necessary information about
343
+ the corpus.
344
+
345
+ Args:
346
+ thinlto_build: Whether or not the build was done with ThinLTO and if so,
347
+ what kind of ThinLTO. Set this to None if the build was not performed with
348
+ ThinLTO.
349
+ relative_output_paths: The relative (to the corpus directory) output paths
350
+ of all the bitcode files that should be placed in the corpus manifest.
351
+ output_dir: The corpus directory where the corpus manifest should be
352
+ placed.
353
+ """
311
354
# This comes first rather than later so global_command_override is at the top
312
355
# of the .json after being written
313
356
if thinlto_build == 'local' :
0 commit comments