Skip to content

Commit 7cf4deb

Browse files
authored
Cleanup around sarif processing (#886)
* fix typing and handle possible exceptions
* fix type
1 parent bcdcc99 commit 7cf4deb

File tree

5 files changed

+47
-12
lines changed

5 files changed

+47
-12
lines changed

src/codemodder/codemodder.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def run(
126126
verbose: bool = False,
127127
log_format: OutputFormat = OutputFormat.JSON,
128128
project_name: str | None = None,
129-
tool_result_files_map: DefaultDict[str, list[str]] = defaultdict(list),
129+
tool_result_files_map: DefaultDict[str, list[Path]] = defaultdict(list),
130130
path_include: list[str] | None = None,
131131
path_exclude: list[str] | None = None,
132132
codemod_include: list[str] | None = None,
@@ -240,8 +240,7 @@ def _run_cli(original_args) -> int:
240240
return 1
241241

242242
try:
243-
# TODO: this should be dict[str, list[Path]]
244-
tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools(
243+
tool_result_files_map: DefaultDict[str, list[Path]] = detect_sarif_tools(
245244
[Path(name) for name in argv.sarif or []]
246245
)
247246
except (DuplicateToolError, FileNotFoundError) as err:

src/codemodder/codemods/test/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def run_and_assert(
193193
directory=root,
194194
dry_run=False,
195195
verbose=False,
196-
tool_result_files_map={self.tool: [str(tmp_results_file_path)]},
196+
tool_result_files_map={self.tool: [tmp_results_file_path]},
197197
registry=mock.MagicMock(),
198198
providers=load_providers(),
199199
repo_manager=mock.MagicMock(),

src/codemodder/context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class CodemodExecutionContext:
4949
path_include: list[str]
5050
path_exclude: list[str]
5151
max_workers: int = 1
52-
tool_result_files_map: dict[str, list[str]]
52+
tool_result_files_map: dict[str, list[Path]]
5353
semgrep_prefilter_results: ResultSet | None = None
5454
openai_llm_client: OpenAI | None = None
5555
azure_llama_llm_client: ChatCompletionsClient | None = None
@@ -64,7 +64,7 @@ def __init__(
6464
repo_manager: PythonRepoManager | None = None,
6565
path_include: list[str] | None = None,
6666
path_exclude: list[str] | None = None,
67-
tool_result_files_map: dict[str, list[str]] | None = None,
67+
tool_result_files_map: dict[str, list[Path]] | None = None,
6868
max_workers: int = 1,
6969
):
7070
self.directory = directory

src/codemodder/sarifs.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,27 @@ def detect(cls, run_data: dict) -> bool:
1818
class DuplicateToolError(ValueError): ...
1919

2020

21-
def detect_sarif_tools(filenames: list[Path]) -> DefaultDict[str, list[str]]:
22-
results: DefaultDict[str, list[str]] = defaultdict(list)
21+
def detect_sarif_tools(filenames: list[Path]) -> DefaultDict[str, list[Path]]:
22+
results: DefaultDict[str, list[Path]] = defaultdict(list)
2323

2424
logger.debug("loading registered SARIF tool detectors")
2525
detectors = {
2626
ent.name: ent.load() for ent in entry_points().select(group="sarif_detectors")
2727
}
2828
for fname in filenames:
29-
data = json.loads(fname.read_text(encoding="utf-8-sig"))
29+
try:
30+
data = json.loads(fname.read_text(encoding="utf-8-sig"))
31+
except json.JSONDecodeError:
32+
logger.exception("Malformed JSON file: %s", fname)
33+
raise
3034
for name, det in detectors.items():
31-
# TODO: handle malformed sarif?
32-
for run in data["runs"]:
35+
try:
36+
runs = data["runs"]
37+
except KeyError:
38+
logger.exception("Sarif file without `runs` data: %s", fname)
39+
raise
40+
41+
for run in runs:
3342
try:
3443
if det.detect(run):
3544
logger.debug("detected %s sarif: %s", name, fname)
@@ -39,7 +48,7 @@ def detect_sarif_tools(filenames: list[Path]) -> DefaultDict[str, list[str]]:
3948
raise DuplicateToolError(
4049
f"duplicate tool sarif detected: {name}"
4150
)
42-
results[name].append(str(fname))
51+
results[name].append(Path(fname))
4352
except DuplicateToolError as err:
4453
raise err
4554
except (KeyError, AttributeError, ValueError):

tests/test_sarif_processing.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def test_detect_sarif_with_bom_encoding(self, tmpdir):
4545

4646
results = detect_sarif_tools([sarif_file_bom])
4747
assert len(results) == 1
48+
assert isinstance(results["semgrep"][0], Path)
4849

4950
@pytest.mark.parametrize("truncate", [True, False])
5051
def test_results_by_rule_id(self, truncate):
@@ -111,6 +112,32 @@ def test_two_sarifs_same_tool(self):
111112
detect_sarif_tools([Path("tests/samples/webgoat_v8.2.0_codeql.sarif")] * 2)
112113
assert "duplicate tool sarif detected: codeql" in str(exc.value)
113114

115+
def test_bad_sarif(self, tmpdir, caplog):
116+
sarif_file = Path("tests") / "samples" / "semgrep.sarif"
117+
bad_json = tmpdir / "bad.sarif"
118+
with open(bad_json, "w") as f:
119+
# remove all { to make a badly formatted json
120+
f.write(sarif_file.read_text(encoding="utf-8").replace("{", ""))
121+
122+
with pytest.raises(json.JSONDecodeError):
123+
detect_sarif_tools([bad_json])
124+
assert f"Malformed JSON file: {str(bad_json)}" in caplog.text
125+
126+
def test_bad_sarif_no_runs_data(self, tmpdir, caplog):
127+
bad_json = tmpdir / "bad.sarif"
128+
data = """
129+
{
130+
"$schema": "https://docs.oasis-open.org/sarif/sarif/v2.1.0/os/schemas/sarif-schema-2.1.0.json",
131+
"version": "2.1.0"
132+
}
133+
"""
134+
with open(bad_json, "w") as f:
135+
f.write(data)
136+
137+
with pytest.raises(KeyError):
138+
detect_sarif_tools([bad_json])
139+
assert f"Sarif file without `runs` data: {str(bad_json)}" in caplog.text
140+
114141
def test_two_sarifs_different_tools(self):
115142
results = detect_sarif_tools(
116143
[

0 commit comments

Comments
 (0)