Skip to content

Commit ceec562

Browse files
Add docstrings to extract_ir_lib.py
Now that extract_ir_lib.py is being used in other projects through the Pip package rather than just the ml-compiler-opt CLI tooling, it would be good to have better documentation on how to use these functions. This commit adds docstrings for some of the functions that are most likely to be used in downstream projects and fixes up some other docstrings. This was motivated by passing an empty string instead of None to cmd_filter, which led to some annoying debugging when other issues were at play.
1 parent 94f030b commit ceec562

File tree

1 file changed

+45
-2
lines changed

1 file changed

+45
-2
lines changed

compiler_opt/tools/extract_ir_lib.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ def thinlto_index_file(self):
105105

106106
def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
107107
cmd_section_name: str):
108-
"""Call llvm_objcopy to extract the llvmcmd section in self._cmd_file."""
108+
"""Get llvm-objcopy and process args to produce a command string that,
109+
when invoked, will extract the cmd section into the self.cmd_file() file.
110+
"""
109111
return [
110112
llvm_objcopy_path,
111113
'--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
@@ -114,7 +116,10 @@ def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
114116

115117
def _get_extraction_bc_command(self, llvm_objcopy_path: str,
116118
bitcode_section_name: str):
117-
"""Call llvm_objcopy to extract the llvmbc section in self._bc_file."""
119+
"""Gets llvm-objcopy and process args to produce a command string that,
120+
when invoked, will extract the bitcode section into the self.bc_file()
121+
file.
122+
"""
118123
return [
119124
llvm_objcopy_path,
120125
'--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
@@ -252,6 +257,14 @@ def make_obj(obj_file: str) -> TrainingIRExtractor:
252257

253258
def load_from_directory(obj_base_dir: str,
254259
output_dir: str) -> List[TrainingIRExtractor]:
260+
"""Create an object file array by globbing an entire directory.
261+
262+
Args:
263+
obj_base_dir: The base build directory that all object files will be
264+
written out as being relative to.
265+
output_dir: The output directory where extracted .bc and .cmd files should
266+
be placed.
267+
"""
255268
paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
256269

257270
def make_spec(obj_file: str):
@@ -292,6 +305,24 @@ def extract_artifacts(obj: TrainingIRExtractor, llvm_objcopy_path: str,
292305
def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
293306
llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
294307
cmd_section_name: str, bitcode_section_name: str):
308+
"""Extracts all specified object files into the corpus directory.
309+
310+
Args:
311+
objs: A list of TrainingIRExtractor Objects that represent the object files
312+
to extract bitcode/commands from.
313+
num_workers: The number of parallel processes to spawn to run the
314+
extraction.
315+
llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
316+
cmd_filter: A regular expression that is used to select for compilations
317+
performed with specific flags. If you want to include all compilations,
318+
set this to None.
319+
thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
320+
Set this to None if the build was not done with ThinLTO.
321+
cmd_section_name: The name of the command line section created by the
322+
bitcode embedding.
323+
bitcode_section_name: The name of the bitcode section created by the
324+
bitcode embedding.
325+
"""
295326
extract_artifacts_function = functools.partial(
296327
extract_artifacts,
297328
llvm_objcopy_path=llvm_objcopy_path,
@@ -308,6 +339,18 @@ def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
308339

309340
def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
310341
output_dir: str):
342+
"""Writes a corpus_manifest.json containing all necessary information about
343+
the corpus.
344+
345+
Args:
346+
thinlto_build: Whether or not the build was done with ThinLTO and if so,
347+
what kind of ThinLTO. Set this to None if the build was not performed with
348+
ThinLTO.
349+
relative_output_paths: The relative (to the corpus directory) output paths
350+
of all the bitcode files that should be placed in the corpus manifest.
351+
output_dir: The corpus directory where the corpus manifest should be
352+
placed.
353+
"""
311354
# This comes first rather than later so global_command_override is at the top
312355
# of the .json after being written
313356
if thinlto_build == 'local':

0 commit comments

Comments
 (0)