Skip to content

Commit 2c63203

Browse files
authored
Fix default path include/exclude behavior; update codemod hierarchy (#780)
* Refactor codemod hierarchy
* Move path include/exclude/default behavior into codemods
* Remove `files_to_analyze` from detector API
* Remove `files_to_analyze` from codemod API
* Fix pygoat integration test
* Better docstrings and type annotations
1 parent fd09145 commit 2c63203

23 files changed

+465
-170
lines changed

.github/workflows/codemod_pygoat.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,6 @@ jobs:
3838
repository: pixee/pygoat
3939
path: pygoat
4040
- name: Run Codemodder
41-
run: codemodder --output output.codetf pygoat
41+
run: codemodder --dry-run --output output.codetf pygoat
4242
- name: Check PyGoat Findings
4343
run: make pygoat-test

ci_tests/test_pygoat_findings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
"pixee:python/add-requests-timeouts",
77
"pixee:python/secure-random",
88
"pixee:python/sandbox-process-creation",
9+
"pixee:python/subprocess-shell-false",
910
"pixee:python/django-session-cookie-secure-off",
11+
"pixee:python/django-model-without-dunder-str",
1012
"pixee:python/harden-pyyaml",
1113
"pixee:python/django-debug-flag-on",
1214
"pixee:python/url-sandbox",

src/codemodder/cli.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import sys
44

55
from codemodder import __version__
6-
from codemodder.code_directory import DEFAULT_EXCLUDED_PATHS
76
from codemodder.logging import OutputFormat, logger
87
from codemodder.registry import CodemodRegistry
98

@@ -143,7 +142,7 @@ def parse_args(argv, codemod_registry: CodemodRegistry):
143142
parser.add_argument(
144143
"--path-exclude",
145144
action=CsvListAction,
146-
default=DEFAULT_EXCLUDED_PATHS,
145+
default=[],
147146
help="Comma-separated set of UNIX glob patterns to exclude",
148147
)
149148
parser.add_argument(

src/codemodder/code_directory.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,54 +39,63 @@ def file_line_patterns(file_path: str | Path, patterns: Sequence[str]):
3939
]
4040

4141

42-
def filter_files(names: Sequence[str], patterns: Sequence[str], exclude: bool = False):
42+
def filter_files(names: list[Path], patterns: Sequence[str], exclude: bool = False):
4343
patterns = (
4444
[x.split(":")[0] for x in (patterns or [])]
4545
if not exclude
4646
# An excluded line should not cause the entire file to be excluded
4747
else [x for x in (patterns or []) if ":" not in x]
4848
)
49-
return itertools.chain(*[fnmatch.filter(names, pattern) for pattern in patterns])
49+
return itertools.chain(
50+
*[fnmatch.filter((str(x) for x in names), pattern) for pattern in patterns]
51+
)
52+
53+
54+
def files_for_directory(parent_path: Path) -> list[Path]:
55+
"""
56+
Return list of all (non-symlink) file paths within a directory, recursively.
57+
"""
58+
return [
59+
path
60+
for path in Path(parent_path).rglob("*")
61+
if Path(path).is_file() and not Path(path).is_symlink()
62+
]
5063

5164

5265
def match_files(
53-
parent_path: str | Path,
66+
parent_path: Path,
67+
input_paths: list[Path],
5468
exclude_paths: Optional[Sequence[str]] = None,
5569
include_paths: Optional[Sequence[str]] = None,
56-
):
70+
) -> list[Path]:
5771
"""
5872
Find pattern-matching files starting at the parent_path, recursively.
5973
6074
If a file matches any exclude pattern, it is not matched. If any include
61-
patterns are passed in, a file must match `*.py` and at least one include patterns.
75+
patterns are passed in, a file must match at least one include pattern.
6276
6377
:param parent_path: path of the starting directory
64-
:param exclude_paths: list of UNIX glob patterns to exclude
65-
:param include_paths: list of UNIX glob patterns to exclude
78+
:param exclude_paths: list of UNIX glob patterns to exclude, uses DEFAULT_EXCLUDED_PATHS if None
79+
:param include_paths: list of UNIX glob patterns to include, uses DEFAULT_INCLUDED_PATHS if None
6680
6781
:return: list of <pathlib.PosixPath> files found within (including recursively) the parent directory
6882
that match the criteria of both exclude and include patterns.
6983
"""
70-
all_files = [
71-
str(Path(path).relative_to(parent_path))
72-
for path in Path(parent_path).rglob("*")
73-
]
84+
paths = [p.relative_to(parent_path) for p in input_paths]
7485
included_files = set(
7586
filter_files(
76-
all_files,
87+
paths,
7788
include_paths if include_paths is not None else DEFAULT_INCLUDED_PATHS,
7889
)
7990
)
8091
excluded_files = set(
8192
filter_files(
82-
all_files,
93+
paths,
8394
exclude_paths if exclude_paths is not None else DEFAULT_EXCLUDED_PATHS,
8495
exclude=True,
8596
)
8697
)
8798

8899
return [
89-
path
90-
for p in sorted(list(included_files - excluded_files))
91-
if (path := Path(parent_path).joinpath(p)).is_file()
100+
parent_path.joinpath(p) for p in sorted(list(included_files - excluded_files))
92101
]

src/codemodder/codemodder.py

Lines changed: 12 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from codemodder import __version__, providers, registry
1010
from codemodder.cli import parse_args
11-
from codemodder.code_directory import match_files
1211
from codemodder.codemods.api import BaseCodemod
1312
from codemodder.codemods.semgrep import SemgrepRuleDetector
1413
from codemodder.codetf import CodeTF
@@ -72,38 +71,22 @@ def log_report(context, argv, elapsed_ms, files_to_analyze):
7271
def apply_codemods(
7372
context: CodemodExecutionContext,
7473
codemods_to_run: Sequence[BaseCodemod],
75-
semgrep_results: ResultSet,
76-
files_to_analyze: list[Path],
7774
):
7875
log_section("scanning")
7976

80-
if not files_to_analyze:
77+
if not context.files_to_analyze:
8178
logger.info("no files to scan")
8279
return
8380

8481
if not codemods_to_run:
8582
logger.info("no codemods to run")
8683
return
8784

88-
semgrep_finding_ids = semgrep_results.all_rule_ids()
89-
9085
# run codemods one at a time making sure to respect the given sequence
9186
for codemod in codemods_to_run:
9287
# NOTE: this may be used as a progress indicator by upstream tools
9388
logger.info("running codemod %s", codemod.id)
94-
95-
if isinstance(codemod.detector, SemgrepRuleDetector):
96-
if codemod._internal_name not in semgrep_finding_ids:
97-
logger.debug(
98-
"no results from semgrep for %s, skipping analysis",
99-
codemod.id,
100-
)
101-
continue
102-
103-
files_to_analyze = semgrep_results.files_for_rule(codemod._internal_name)
104-
105-
# Non-semgrep codemods ignore the semgrep results
106-
codemod.apply(context, files_to_analyze)
89+
codemod.apply(context)
10790
record_dependency_update(context.process_dependencies(codemod.id))
10891
context.log_changes(codemod.id)
10992

@@ -197,37 +180,24 @@ def run(original_args) -> int:
197180
sast_only=argv.sonar_issues_json or argv.sarif,
198181
)
199182

200-
included_paths = argv.path_include or codemod_registry.default_include_paths
201-
202183
log_section("setup")
203184
log_list(logging.INFO, "running", codemods_to_run, predicate=lambda c: c.id)
204-
log_list(logging.INFO, "including paths", included_paths)
185+
log_list(logging.INFO, "including paths", context.included_paths)
205186
log_list(logging.INFO, "excluding paths", argv.path_exclude)
206187

207-
files_to_analyze: list[Path] = [
208-
path
209-
for path in match_files(
210-
context.directory,
211-
argv.path_exclude,
212-
included_paths,
213-
)
214-
if path.is_file() and not path.is_symlink()
215-
]
216-
217-
full_names = [str(path) for path in files_to_analyze]
218-
log_list(logging.DEBUG, "matched files", full_names)
188+
log_list(
189+
logging.DEBUG, "matched files", (str(path) for path in context.files_to_analyze)
190+
)
219191

220-
semgrep_results: ResultSet = find_semgrep_results(
192+
context.semgrep_prefilter_results = find_semgrep_results(
221193
context,
222194
codemods_to_run,
223-
files_to_analyze,
195+
context.find_and_fix_paths,
224196
)
225197

226198
apply_codemods(
227199
context,
228200
codemods_to_run,
229-
semgrep_results,
230-
files_to_analyze,
231201
)
232202

233203
elapsed = datetime.datetime.now() - start
@@ -243,7 +213,10 @@ def run(original_args) -> int:
243213
codetf.write_report(argv.output)
244214

245215
log_report(
246-
context, argv, elapsed_ms, [] if not codemods_to_run else files_to_analyze
216+
context,
217+
argv,
218+
elapsed_ms,
219+
[] if not codemods_to_run else context.files_to_analyze,
247220
)
248221
return 0
249222

src/codemodder/codemods/api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55

66
from codemodder.codemods.base_codemod import ( # noqa: F401
77
BaseCodemod,
8+
FindAndFixCodemod,
89
Metadata,
910
Reference,
11+
RemediationCodemod,
1012
ReviewGuidance,
1113
ToolMetadata,
1214
ToolRule,

0 commit comments

Comments
 (0)