Add wrapper instead of moving

thevinster · thevinster · commit ee72352a0ec4 · 2025-07-04T11:13:02.000-07:00
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -1,52 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
+#!/usr/bin/env python3
 
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
+import re
+import sys
+from mlgo.corpus.combine_training_corpus import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
-  --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for combining multiple training corpora"
-    )
-    parser.add_argument(
-        "--root_dir", type=str, help="The root dir of module paths to combine."
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
@@ -1,184 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
+#!/usr/bin/env python3
 
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
+import re
+import sys
+from mlgo.corpus.extract_ir import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from build artifacts"
-    )
-    parser.add_argument(
-        "--input",
-        type=str,
-        help="Input file or directory - either compile_commands.json, a linker "
-        "parameter list, or a path to a directory containing object files.",
-    )
-    parser.add_argument(
-        "--input_type",
-        type=str,
-        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
-        choices=["json", "params", "directory", "bazel_aquery"],
-        default="json",
-        nargs="?",
-    )
-    parser.add_argument("--output_dir", type=str, help="Output directory")
-    parser.add_argument(
-        "--num_workers",
-        type=int,
-        help="Number of parallel works for objcopy. `None` for maximum available.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--llvm_objcopy_path",
-        type=str,
-        help="Path to llvm-objcopy",
-        default="llvm-objcopy",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--obj_base_dir",
-        type=str,
-        help="Base directory for object files. Defaults to current working dir.",
-        default="",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_filter",
-        type=str,
-        help="Include only those modules with a command line matching this regular "
-        "expression. Set it to None to not perform any filtering. Note that the "
-        "regular expression is applied independently for each separate command line "
-        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
-        "with thinlto_build=lld.",
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--thinlto_build",
-        type=str,
-        help="Set if the build was performed with either 'distributed' or 'local' "
-        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
-        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
-        "the distributed case or -Wl,--save-temps=import and "
-        "-Wl,--thinlto-emit-index-files passed in the local case",
-        choices=["distributed", "local"],
-        default=None,
-        nargs="?",
-    )
-    parser.add_argument(
-        "--cmd_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmcmd is correct. For Mach-O object files, one should use "
-        "something like __LLVM,__cmdline",
-        default=".llvmcmd",
-        nargs="?",
-    )
-    parser.add_argument(
-        "--bitcode_section_name",
-        type=str,
-        help="The section name passed to llvm-objcopy. For ELF object files, the "
-        "default .llvmbc is correct. For Mach-O object files, one should use "
-        "__LLVM,__bitcode",
-        default=".llvmbc",
-        nargs="?",
-    )
-    flags.add_verbosity_arguments(parser)
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.basicConfig(level=args.verbosity)
-
-    objs = []
-    if args.input is not None and args.thinlto_build == "local":
-        raise ValueError("--thinlto_build=local cannot be run with --input")
-    if args.input is None:
-        if args.thinlto_build != "local":
-            raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
-    elif args.input_type == "json":
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), args.output_dir
-            )
-    elif args.input_type == "params":
-        if not args.obj_base_dir:
-            logging.info(
-                "-obj_base_dir is unspecified, assuming current directory. "
-                "If no objects are found, use this option to specify the root "
-                "directory for the object file paths in the input file."
-            )
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
-            )
-    elif args.input_type == "directory":
-        logging.warning(
-            "Using the directory input is only recommended if the build system "
-            "your project uses does not support any structured output that "
-            "ml-compiler-opt understands. If your build system provides a "
-            "structured compilation database, use that instead"
-        )
-        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    elif args.input_type == "bazel_aquery":
-        with open(args.input, encoding="utf-8") as aquery_json_handle:
-            objs = extract_ir_lib.load_bazel_aquery(
-                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
-            )
-    else:
-        logging.error("Unknown input type: %s", args.input_type)
-
-    relative_output_paths = extract_ir_lib.run_extraction(
-        objs,
-        args.num_workers,
-        args.llvm_objcopy_path,
-        args.cmd_filter,
-        args.thinlto_build,
-        args.cmd_section_name,
-        args.bitcode_section_name,
-    )
-
-    extract_ir_lib.write_corpus_manifest(
-        args.thinlto_build, relative_output_paths, args.output_dir
-    )
-
-    logging.info(
-        "Converted %d files out of %d",
-        len(objs) - relative_output_paths.count(None),
-        len(objs),
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
@@ -1,53 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
+#!/usr/bin/env python3
 
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
+import re
+import sys
+from mlgo.corpus.make_corpus import parse_args_and_run
+if __name__ == '__main__':
+    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+    sys.exit(parse_args_and_run())
 
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
-  --input_dir=<path to input directory> \
-  --output_dir=<path to output directory> \
-  --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from arbitrary bitcode"
-    )
-    parser.add_argument("--input_dir", type=str, help="The input directory.")
-    parser.add_argument("--output_dir", type=str, help="The output directory.")
-    parser.add_argument(
-        "--default_args",
-        type=str,
-        help="The compiler flags to compile with when using downstream tooling.",
-        default="",
-        nargs="?",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-def main(args):
-    logging.warning(
-        "Using this tool does not guarantee that the bitcode is taken at "
-        "the correct stage for consumption during model training. Make "
-        "sure to validate assumptions about where the bitcode is coming "
-        "from before using it in production."
-    )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
-    make_corpus_lib.write_corpus_manifest(
-        relative_paths, args.output_dir, args.default_args.split()
-    )
-
-
-if __name__ == "__main__":
-    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,52 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently this only supports the case where the corpora share the same
+configurables except for the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    flags.add_verbosity_arguments(parser)
+    args = parser.parse_args()
+    main(args)
+
+
+def main(args):
+    logging.basicConfig(level=args.verbosity)
+
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py