Enable running codemodder as a library import (#879)

clavedeluna · drdavella · web-flow · commit 1361e68c81bb · 2024-10-15T17:44:11.000Z
* First steps towards refactoring as a library

* Make sure no threads are used for maxworkers=1 case

* Fix up some types and defaults

* organize run args

* run returns output

* make output a Path

* make dry run required

* fix sast only filtering

---------

Co-authored-by: Daniel D'Avella <dan.davella@pixee.ai>
diff --git a/README.md b/README.md
@@ -30,9 +30,11 @@ To install the package from source, use `pip`:
 $ pip install /path/to/codemodder-python
 ```
 
-## Running Locally
+## Running `codemodder`
 
-The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation.
+### CLI
+
+Codemodder can be run as a CLI. The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation.
 
 For basic usage, run the `codemodder` command with a target directory path:
 
@@ -55,6 +57,19 @@ For a full list of options, use the `--help` flag:
 $ codemodder --help
 ```
 
+### Library
+
+You can also use `codemodder` as a library by importing the package and calling `codemodder.run`. For basic usage, pass a target directory path and the required `dry_run` argument:
+
+```python
+import codemodder
+
+output, exit_code = codemodder.run("/path/to/my-project", dry_run=True)
+```
+
+Unlike the CLI, where `--dry-run` defaults to `False`, the library has no default for `dry_run`: you must state explicitly whether `codemodder` is allowed to make changes to your files.
+
+
 ## Architecture
 
 Codemods are composed of the following key components:
diff --git a/src/codemodder/__init__.py b/src/codemodder/__init__.py
@@ -2,3 +2,7 @@
     from ._version import __version__
 except ImportError:  # pragma: no cover
     __version__ = "unknown"
+
+from codemodder.codemodder import run
+
+__all__ = ["run", "__version__"]
diff --git a/src/codemodder/cli.py b/src/codemodder/cli.py
@@ -121,6 +121,7 @@ def parse_args(argv, codemod_registry: CodemodRegistry):
     parser.add_argument(
         "--dry-run",
         action=argparse.BooleanOptionalAction,
+        default=False,
         help="do everything except make changes to files",
     )
     parser.add_argument(
diff --git a/src/codemodder/codemodder.py b/src/codemodder/codemodder.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import sys
+from collections import defaultdict
 from pathlib import Path
 from typing import DefaultDict, Sequence
 
@@ -14,7 +15,13 @@
 from codemodder.context import CodemodExecutionContext
 from codemodder.dependency import Dependency
 from codemodder.llm import MisconfiguredAIClient
-from codemodder.logging import configure_logger, log_list, log_section, logger
+from codemodder.logging import (
+    OutputFormat,
+    configure_logger,
+    log_list,
+    log_section,
+    logger,
+)
 from codemodder.project_analysis.file_parsers.package_store import PackageStore
 from codemodder.project_analysis.python_repo_manager import PythonRepoManager
 from codemodder.result import ResultSet
@@ -45,7 +52,7 @@ def find_semgrep_results(
     return run_semgrep(context, yaml_files, files_to_analyze)
 
 
-def log_report(context, argv, elapsed_ms, files_to_analyze):
+def log_report(context, output, elapsed_ms, files_to_analyze):
     log_section("report")
     logger.info("scanned: %s files", len(files_to_analyze))
     all_failures = context.get_failed_files()
@@ -60,7 +67,7 @@ def log_report(context, argv, elapsed_ms, files_to_analyze):
         len(all_changes),
         len(set(all_changes)),
     )
-    logger.info("report file: %s", argv.output)
+    logger.info("report file: %s", output)
     logger.info("total elapsed: %s ms", elapsed_ms)
     logger.info("  semgrep:     %s ms", context.timer.get_time_ms("semgrep"))
     logger.info("  parse:       %s ms", context.timer.get_time_ms("parse"))
@@ -111,79 +118,79 @@ def record_dependency_update(dependency_results: dict[Dependency, PackageStore |
             logger.debug("The following dependencies could not be added: %s", str_list)
 
 
-def run(original_args) -> int:
+def run(
+    directory: Path | str,
+    dry_run: bool,
+    output: Path | str | None = None,
+    output_format: str = "codetf",
+    verbose: bool = False,
+    log_format: OutputFormat = OutputFormat.JSON,
+    project_name: str | None = None,
+    tool_result_files_map: DefaultDict[str, list[str]] = defaultdict(list),
+    path_include: list[str] | None = None,
+    path_exclude: list[str] | None = None,
+    codemod_include: list[str] | None = None,
+    codemod_exclude: list[str] | None = None,
+    max_workers: int = 1,
+    original_cli_args: list[str] | None = None,
+    codemod_registry: registry.CodemodRegistry | None = None,
+    sast_only: bool = False,
+) -> tuple[CodeTF | None, int]:
     start = datetime.datetime.now()
 
-    codemod_registry = registry.load_registered_codemods()
-    provider_registry = providers.load_providers()
+    codemod_registry = codemod_registry or registry.load_registered_codemods()
 
-    # A little awkward, but we need the codemod registry in order to validate potential arguments
-    argv = parse_args(original_args, codemod_registry)
-    if not os.path.exists(argv.directory):
-        logger.error(
-            "given directory '%s' doesn't exist or can’t be read",
-            argv.directory,
-        )
-        return 1
+    path_include = path_include or []
+    path_exclude = path_exclude or []
+    codemod_include = codemod_include or []
+    codemod_exclude = codemod_exclude or []
+
+    provider_registry = providers.load_providers()
 
-    configure_logger(argv.verbose, argv.log_format, argv.project_name)
+    configure_logger(verbose, log_format, project_name)
 
     log_section("startup")
     logger.info("codemodder: python/%s", __version__)
-    logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args))
-
-    try:
-        # TODO: this should be dict[str, list[Path]]
-        tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools(
-            [Path(name) for name in argv.sarif or []]
-        )
-    except (DuplicateToolError, FileNotFoundError) as err:
-        logger.error(err)
-        return 1
-
-    tool_result_files_map["sonar"].extend(argv.sonar_issues_json or [])
-    tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or [])
-    tool_result_files_map["defectdojo"] = argv.defectdojo_findings_json or []
 
     for file_name in itertools.chain(*tool_result_files_map.values()):
         if not os.path.exists(file_name):
             logger.error(
                 f"FileNotFoundError: [Errno 2] No such file or directory: '{file_name}'"
             )
-            return 1
+            return None, 1
 
-    repo_manager = PythonRepoManager(Path(argv.directory))
+    repo_manager = PythonRepoManager(Path(directory))
 
     try:
         context = CodemodExecutionContext(
-            Path(argv.directory),
-            argv.dry_run,
-            argv.verbose,
+            Path(directory),
+            dry_run,
+            verbose,
             codemod_registry,
             provider_registry,
             repo_manager,
-            argv.path_include,
-            argv.path_exclude,
+            path_include,
+            path_exclude,
             tool_result_files_map,
-            argv.max_workers,
+            max_workers,
         )
     except MisconfiguredAIClient as e:
         logger.error(e)
-        return 3  # Codemodder instructions conflicted (according to spec)
+        return None, 3  # Codemodder instructions conflicted (according to spec)
 
-    repo_manager.parse_project()
+    context.repo_manager.parse_project()
 
     # TODO: this should be a method of CodemodExecutionContext
     codemods_to_run = codemod_registry.match_codemods(
-        argv.codemod_include,
-        argv.codemod_exclude,
-        sast_only=argv.sonar_issues_json or argv.sarif,
+        codemod_include,
+        codemod_exclude,
+        sast_only=sast_only,
     )
 
     log_section("setup")
     log_list(logging.INFO, "running", codemods_to_run, predicate=lambda c: c.id)
     log_list(logging.INFO, "including paths", context.included_paths)
-    log_list(logging.INFO, "excluding paths", argv.path_exclude)
+    log_list(logging.INFO, "excluding paths", path_exclude)
 
     log_list(
         logging.DEBUG, "matched files", (str(path) for path in context.files_to_analyze)
@@ -203,24 +210,71 @@ def run(original_args) -> int:
     elapsed = datetime.datetime.now() - start
     elapsed_ms = int(elapsed.total_seconds() * 1000)
 
-    if argv.output:
-        codetf = CodeTF.build(
-            context,
-            elapsed_ms,
-            original_args,
-            context.compile_results(codemods_to_run),
-        )
-        codetf.write_report(argv.output)
+    logger.debug("Output format %s", output_format)
+    codetf = CodeTF.build(
+        context,
+        elapsed_ms,
+        original_cli_args or [],
+        context.compile_results(codemods_to_run),
+    )
+    if output:
+        codetf.write_report(output)
 
     log_report(
         context,
-        argv,
+        output,
         elapsed_ms,
         [] if not codemods_to_run else context.files_to_analyze,
     )
-    return 0
+    return codetf, 0
+
+
+def _run_cli(original_args) -> int:
+    codemod_registry = registry.load_registered_codemods()
+    argv = parse_args(original_args, codemod_registry)
+    if not os.path.exists(argv.directory):
+        logger.error(
+            "given directory '%s' doesn't exist or can’t be read",
+            argv.directory,
+        )
+        return 1
+
+    try:
+        # TODO: this should be dict[str, list[Path]]
+        tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools(
+            [Path(name) for name in argv.sarif or []]
+        )
+    except (DuplicateToolError, FileNotFoundError) as err:
+        logger.error(err)
+        return 1
+
+    tool_result_files_map["sonar"].extend(argv.sonar_issues_json or [])
+    tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or [])
+    tool_result_files_map["defectdojo"].extend(argv.defectdojo_findings_json or [])
+
+    logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args))
+
+    _, status = run(
+        argv.directory,
+        argv.dry_run,
+        argv.output,
+        argv.output_format,
+        argv.verbose,
+        argv.log_format,
+        argv.project_name,
+        tool_result_files_map,
+        argv.path_include,
+        argv.path_exclude,
+        argv.codemod_include,
+        argv.codemod_exclude,
+        max_workers=argv.max_workers,
+        original_cli_args=original_args,
+        codemod_registry=codemod_registry,
+        sast_only=argv.sonar_issues_json or argv.sarif,
+    )
+    return status
 
 
 def main():
     sys_argv = sys.argv[1:]
-    sys.exit(run(sys_argv))
+    sys.exit(_run_cli(sys_argv))
diff --git a/src/codemodder/codemods/base_codemod.py b/src/codemodder/codemods/base_codemod.py
@@ -229,10 +229,15 @@ def _apply(
             self._process_file, context=context, results=results, rules=rules
         )
 
-        with ThreadPoolExecutor() as executor:
-            logger.debug("using executor with %s workers", context.max_workers)
-            contexts = executor.map(process_file, files_to_analyze)
-            executor.shutdown(wait=True)
+        contexts = []
+        if context.max_workers == 1:
+            logger.debug("processing files serially")
+            contexts.extend([process_file(file) for file in files_to_analyze])
+        else:
+            with ThreadPoolExecutor() as executor:
+                logger.debug("using executor with %s workers", context.max_workers)
+                contexts.extend(executor.map(process_file, files_to_analyze))
+                executor.shutdown(wait=True)
 
         context.process_results(self.id, contexts)
 
diff --git a/src/codemodder/codetf.py b/src/codemodder/codetf.py
@@ -9,6 +9,7 @@
 import os
 import sys
 from enum import Enum
+from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
 from pydantic import BaseModel, model_validator
@@ -165,7 +166,7 @@ def build(
         cls,
         context: CodemodExecutionContext,
         elapsed_ms,
-        original_args,
+        original_args: list,
         results: list[Result],
     ):
         command_name = os.path.basename(sys.argv[0])
@@ -183,10 +184,9 @@ def build(
         )
         return cls(run=run, results=results)
 
-    def write_report(self, outfile):
+    def write_report(self, outfile: Path | str):
         try:
-            with open(outfile, "w", encoding="utf-8") as f:
-                f.write(self.model_dump_json(exclude_none=True))
+            Path(outfile).write_text(self.model_dump_json(exclude_none=True))
         except Exception:
             logger.exception("failed to write report file.")
             # Any issues with writing the output file should exit status 2.
diff --git a/src/codemodder/context.py b/src/codemodder/context.py
diff --git a/tests/test_codemodder.py b/tests/test_codemodder.py

Original file line number	Diff line number	Diff line change
`@@ -121,6 +121,7 @@ def parse_args(argv, codemod_registry: CodemodRegistry):`
`121`	`121`	`parser.add_argument(`
`122`	`122`	`"--dry-run",`
`123`	`123`	`action=argparse.BooleanOptionalAction,`
	`124`	`+ default=False,`
`124`	`125`	`help="do everything except make changes to files",`
`125`	`126`	`)`
`126`	`127`	`parser.add_argument(`