From cf2d3fdf7965b87224ef2c16ea688f5b8a8b61d7 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 1 Apr 2025 12:12:03 -0400 Subject: [PATCH 1/6] Use Sarif data model for detection --- pyproject.toml | 1 + src/codemodder/codeql.py | 6 +++--- src/codemodder/sarifs.py | 24 ++++++++++++------------ src/codemodder/semgrep.py | 9 +++------ tests/test_sarif_processing.py | 9 +++++---- 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 15a4288f..b54dfb87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "tomlkit~=0.13.0", "wrapt~=1.17.0", "chardet~=5.2.0", + "sarif-pydantic~=0.1.0", "setuptools~=78.1", ] keywords = ["codemod", "codemods", "security", "fix", "fixes"] diff --git a/src/codemodder/codeql.py b/src/codemodder/codeql.py index 09bffa64..62649d70 100644 --- a/src/codemodder/codeql.py +++ b/src/codemodder/codeql.py @@ -4,13 +4,13 @@ from typing_extensions import Self from codemodder.result import LineInfo, ResultSet, SarifLocation, SarifResult -from codemodder.sarifs import AbstractSarifToolDetector +from codemodder.sarifs import AbstractSarifToolDetector, Run class CodeQLSarifToolDetector(AbstractSarifToolDetector): @classmethod - def detect(cls, run_data: dict) -> bool: - return "tool" in run_data and "CodeQL" in run_data["tool"]["driver"]["name"] + def detect(cls, run_data: Run) -> bool: + return "CodeQL" in run_data.tool.driver.name class CodeQLLocation(SarifLocation): diff --git a/src/codemodder/sarifs.py b/src/codemodder/sarifs.py index 85f4157b..bb9af1e7 100644 --- a/src/codemodder/sarifs.py +++ b/src/codemodder/sarifs.py @@ -1,17 +1,19 @@ -import json from abc import ABCMeta, abstractmethod from collections import defaultdict from importlib.metadata import entry_points from pathlib import Path from typing import DefaultDict +from pydantic import ValidationError +from sarif_pydantic import Run, Sarif + from codemodder.logging import logger class AbstractSarifToolDetector(metaclass=ABCMeta): @classmethod @abstractmethod - def detect(cls, run_data: dict) -> bool: + def detect(cls, run_data: Run) -> bool: pass @@ -24,18 +26,16 @@ def detect_sarif_tools(filenames: list[Path]) -> DefaultDict[str, list[Path]]: } for fname in filenames: try: - data = json.loads(fname.read_text(encoding="utf-8-sig")) - except json.JSONDecodeError: - logger.exception("Malformed JSON file: %s", fname) + data = Sarif.model_validate_json(fname.read_text(encoding="utf-8-sig")) + except ValidationError: + logger.exception("Invalid SARIF file: %s", fname) raise - for name, det in detectors.items(): - try: - runs = data["runs"] - except KeyError: - logger.exception("Sarif file without `runs` data: %s", fname) - raise - for run in runs: + if not data.runs: + raise ValueError(f"SARIF file without `runs` data: {fname}") + + for name, det in detectors.items(): + for run in data.runs: try: if det.detect(run): logger.debug("detected %s sarif: %s", name, fname) diff --git a/src/codemodder/semgrep.py b/src/codemodder/semgrep.py index 699eaa83..35e06b90 100644 --- a/src/codemodder/semgrep.py +++ b/src/codemodder/semgrep.py @@ -10,16 +10,13 @@ from codemodder.context import CodemodExecutionContext from codemodder.logging import logger from codemodder.result import Result, ResultSet, SarifLocation, SarifResult -from codemodder.sarifs import AbstractSarifToolDetector +from codemodder.sarifs import AbstractSarifToolDetector, Run class SemgrepSarifToolDetector(AbstractSarifToolDetector): @classmethod - def detect(cls, run_data: dict) -> bool: - return ( - "tool" in run_data - and "semgrep" in run_data["tool"]["driver"]["name"].lower() - ) + def detect(cls, run_data: Run) -> bool: + return "semgrep" in run_data.tool.driver.name class SemgrepLocation(SarifLocation): diff --git a/tests/test_sarif_processing.py b/tests/test_sarif_processing.py index f7d4f2a9..ca204236 100644 --- a/tests/test_sarif_processing.py +++ b/tests/test_sarif_processing.py @@ -3,6 +3,7 @@ from pathlib import Path import pytest +from pydantic import ValidationError from codemodder.codemods.semgrep import process_semgrep_findings from codemodder.sarifs import detect_sarif_tools @@ -119,9 +120,9 @@ def test_bad_sarif(self, tmpdir, caplog): # remove all { to make a badly formatted json f.write(sarif_file.read_text(encoding="utf-8").replace("{", "")) - with pytest.raises(json.JSONDecodeError): + with pytest.raises(ValidationError): detect_sarif_tools([bad_json]) - assert f"Malformed JSON file: {str(bad_json)}" in caplog.text + assert f"Invalid SARIF file: {str(bad_json)}" in caplog.text def test_bad_sarif_no_runs_data(self, tmpdir, caplog): bad_json = tmpdir / "bad.sarif" @@ -134,9 +135,9 @@ def test_bad_sarif_no_runs_data(self, tmpdir, caplog): with open(bad_json, "w") as f: f.write(data) - with pytest.raises(KeyError): + with pytest.raises(ValidationError): detect_sarif_tools([bad_json]) - assert f"Sarif file without `runs` data: {str(bad_json)}" in caplog.text + assert f"Invalid SARIF file: {str(bad_json)}" in caplog.text def test_two_sarifs_different_tools(self): results = detect_sarif_tools( From e2a20cedda1c2d25152e5451d5ea8e92c59ca3df Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Tue, 1 Apr 2025 21:27:15 -0400 Subject: [PATCH 2/6] Intermediate progress: update to sarif-pydantic 0.2.0 --- pyproject.toml | 2 +- src/codemodder/codeql.py | 13 +++++----- src/codemodder/result.py | 55 ++++++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b54dfb87..86a108ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "tomlkit~=0.13.0", "wrapt~=1.17.0", "chardet~=5.2.0", - "sarif-pydantic~=0.1.0", + "sarif-pydantic~=0.2.0", "setuptools~=78.1", ] keywords = ["codemod", "codemods", "security", "fix", "fixes"] diff --git a/src/codemodder/codeql.py b/src/codemodder/codeql.py index 62649d70..a5e2cee9 100644 --- a/src/codemodder/codeql.py +++ b/src/codemodder/codeql.py @@ -1,6 +1,6 @@ -import json from pathlib import Path +from sarif_pydantic import Sarif from typing_extensions import Self from codemodder.result import LineInfo, ResultSet, SarifLocation, SarifResult @@ -52,16 +52,17 @@ def rule_url_from_id(cls, result: dict, run: dict, rule_id: str) -> str: class CodeQLResultSet(ResultSet): @classmethod def from_sarif(cls, sarif_file: str | Path, truncate_rule_id: bool = False) -> Self: - with open(sarif_file, "r", encoding="utf-8") as f: - data = json.load(f) + data = Sarif.model_validate_json( + Path(sarif_file).read_text(encoding="utf-8-sig") + ) result_set = cls() - for sarif_run in data["runs"]: + for sarif_run in data.runs: if CodeQLSarifToolDetector.detect(sarif_run): - for sarif_result in sarif_run["results"]: + for sarif_result in sarif_run.results or []: codeql_result = CodeQLResult.from_sarif( sarif_result, sarif_run, truncate_rule_id ) result_set.add_result(codeql_result) - result_set.store_tool_data(sarif_run.get("tool", {})) + result_set.store_tool_data(sarif_run.tool.model_dump()) return result_set diff --git a/src/codemodder/result.py b/src/codemodder/result.py index 1c238e48..0403a974 100644 --- a/src/codemodder/result.py +++ b/src/codemodder/result.py @@ -9,6 +9,9 @@ import libcst as cst from boltons.setutils import IndexedSet from libcst._position import CodeRange +from sarif_pydantic import Location as LocationModel +from sarif_pydantic import Result as ResultModel +from sarif_pydantic import Run from typing_extensions import Self from codemodder.codetf import Finding, Rule @@ -41,7 +44,7 @@ def get_snippet(sarif_location) -> str: pass @classmethod - def from_sarif(cls, sarif_location) -> Self: + def from_sarif(cls, sarif_location: LocationModel) -> Self: artifact_location = sarif_location["physicalLocation"]["artifactLocation"] file = Path(artifact_location["uri"]) snippet = cls.get_snippet(sarif_location) @@ -102,7 +105,7 @@ class SarifResult(SASTResult): @classmethod def from_sarif( - cls, sarif_result, sarif_run, truncate_rule_id: bool = False + cls, sarif_result: ResultModel, sarif_run: Run, truncate_rule_id: bool = False ) -> Self: rule_id = cls.extract_rule_id(sarif_result, sarif_run, truncate_rule_id) finding_id = cls.extract_finding_id(sarif_result) or rule_id @@ -124,20 +127,25 @@ def from_sarif( ) @classmethod - def extract_finding_message(cls, sarif_result: dict, sarif_run: dict) -> str | None: - return sarif_result.get("message", {}).get("text", None) + def extract_finding_message( + cls, sarif_result: ResultModel, sarif_run: Run + ) -> str | None: + del sarif_run + return sarif_result.message.text @classmethod - def rule_url_from_id(cls, result: dict, run: dict, rule_id: str) -> str | None: + def rule_url_from_id( + cls, result: ResultModel, run: Run, rule_id: str + ) -> str | None: del result, run, rule_id return None @classmethod - def extract_locations(cls, sarif_result) -> Sequence[Location]: + def extract_locations(cls, sarif_result: ResultModel) -> Sequence[Location]: return tuple( [ cls.location_type.from_sarif(location) - for location in sarif_result["locations"] + for location in sarif_result.locations or [] ] ) @@ -154,38 +162,41 @@ def extract_related_locations(cls, sarif_result) -> Sequence[LocationWithMessage ) @classmethod - def extract_code_flows(cls, sarif_result) -> Sequence[Sequence[Location]]: + def extract_code_flows( + cls, sarif_result: ResultModel + ) -> Sequence[Sequence[Location]]: return tuple( [ tuple( [ - cls.location_type.from_sarif(locations.get("location")) - for locations in threadflow.get("locations", {}) + cls.location_type.from_sarif(locations.location) + for locations in threadflow.locations or [] + if locations.location ] ) - for codeflow in sarif_result.get("codeFlows", {}) - for threadflow in codeflow.get("threadFlows", {}) + for codeflow in sarif_result.code_flows or [] + for threadflow in codeflow.thread_flows or [] ] ) @classmethod - def extract_rule_id(cls, result, sarif_run, truncate_rule_id: bool = False) -> str: - if rule_id := result.get("ruleId"): + def extract_rule_id( + cls, result: ResultModel, sarif_run: Run, truncate_rule_id: bool = False + ) -> str: + if rule_id := result.rule_id: return rule_id.split(".")[-1] if truncate_rule_id else rule_id # it may be contained in the 'rule' field through the tool component in the sarif file - if "rule" in result: - tool_index = result["rule"]["toolComponent"]["index"] - rule_index = result["rule"]["index"] - return sarif_run["tool"]["extensions"][tool_index]["rules"][rule_index][ - "id" - ] + if (rule := result.rule) and sarif_run.tool.extensions and rule.tool_component: + tool_index = rule.tool_component.index + rule_index = rule.index + return sarif_run.tool.extensions[tool_index].rules[rule_index].id raise ValueError("Could not extract rule id from sarif result.") @classmethod - def extract_finding_id(cls, result) -> str | None: - return result.get("guid") or result.get("correlationGuid") + def extract_finding_id(cls, result: ResultModel) -> str | None: + return str(result.guid or "") or str(result.correlation_guid or "") or None def same_line(pos: CodeRange, location: Location) -> bool: From dc4f24c05199c2683c9abc63d56f9c3da0ec11e7 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Wed, 2 Apr 2025 11:13:01 -0400 Subject: [PATCH 3/6] Fix some tests --- pyproject.toml | 2 +- src/codemodder/codeql.py | 33 +++++---- src/codemodder/result.py | 42 ++++++++---- src/codemodder/semgrep.py | 37 ++++++---- .../test_semgrep_django_secure_set_cookie.py | 3 +- .../test_semgrep_enable_jinja2_autoescape.py | 3 +- .../semgrep/test_semgrep_harden_pyyaml.py | 6 +- .../semgrep/test_semgrep_jwt_decode_verify.py | 3 +- .../semgrep/test_semgrep_nan_injection.py | 6 ++ .../semgrep/test_semgrep_no_csrf_exempt.py | 1 + .../semgrep/test_semgrep_rsa_key_size.py | 3 +- .../test_semgrep_sandbox_process_creation.py | 1 + .../test_semgrep_sql_parametrization.py | 3 +- .../test_semgrep_subprocess_shell_false.py | 3 +- .../semgrep/test_semgrep_url_sandbox.py | 1 + .../semgrep/test_semgrep_use_defused_xml.py | 3 +- tests/test_codeql.py | 12 ++-- tests/test_regex_transformer.py | 68 ++++++++++--------- tests/test_sarif_processing.py | 18 ++--- tests/test_semgrep.py | 6 +- tests/test_xml_transformer.py | 67 +++++++++--------- 21 files changed, 192 insertions(+), 129 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 86a108ee..711ac9b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "tomlkit~=0.13.0", "wrapt~=1.17.0", "chardet~=5.2.0", - "sarif-pydantic~=0.2.0", + "sarif-pydantic~=0.3.0", "setuptools~=78.1", ] keywords = ["codemod", "codemods", "security", "fix", "fixes"] diff --git a/src/codemodder/codeql.py b/src/codemodder/codeql.py index a5e2cee9..711a00ff 100644 --- a/src/codemodder/codeql.py +++ b/src/codemodder/codeql.py @@ -1,5 +1,7 @@ from pathlib import Path +from sarif_pydantic import Location as LocationModel +from sarif_pydantic import Result as ResultModel from sarif_pydantic import Sarif from typing_extensions import Self @@ -14,27 +16,30 @@ def detect(cls, run_data: Run) -> bool: class CodeQLLocation(SarifLocation): - @staticmethod - def get_snippet(sarif_location) -> str: - return "" - @classmethod - def from_sarif(cls, sarif_location) -> Self: - artifact_location = sarif_location["physicalLocation"]["artifactLocation"] - file = Path(artifact_location["uri"]) + def from_sarif(cls, sarif_location: LocationModel) -> Self: + if (physical_location := sarif_location.physical_location) is None: + raise ValueError("Location does not contain a physicalLocation") + if (artifact_location := physical_location.artifact_location) is None: + raise ValueError("PhysicalLocation does not contain an artifactLocation") + if (uri := artifact_location.uri) is None: + raise ValueError("ArtifactLocation does not contain a uri") + + file = Path(uri) - try: - region = sarif_location["physicalLocation"]["region"] - except KeyError: + if not (region := physical_location.region): # A location without a region indicates a result for the entire file. # Use sentinel values of 0 index for start/end zero = LineInfo(0) return cls(file=file, start=zero, end=zero) - start = LineInfo(line=region["startLine"], column=region.get("startColumn")) + if not region.start_line: + raise ValueError("Region does not contain a startLine") + + start = LineInfo(line=region.start_line, column=region.start_column or -1) end = LineInfo( - line=region.get("endLine", start.line), - column=region.get("endColumn", start.column), + line=region.end_line or start.line, + column=region.end_column or start.column, ) return cls(file=file, start=start, end=end) @@ -43,7 +48,7 @@ class CodeQLResult(SarifResult): location_type = CodeQLLocation @classmethod - def rule_url_from_id(cls, result: dict, run: dict, rule_id: str) -> str: + def rule_url_from_id(cls, result: ResultModel, run: Run, rule_id: str) -> str: del result, run, rule_id # TODO: Implement this method to return the specific rule URL return "https://codeql.github.com/codeql-query-help/" diff --git a/src/codemodder/result.py b/src/codemodder/result.py index 0403a974..023683ec 100644 --- a/src/codemodder/result.py +++ b/src/codemodder/result.py @@ -1,7 +1,6 @@ from __future__ import annotations import itertools -from abc import abstractmethod from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Sequence, Type @@ -39,23 +38,30 @@ class Location(ABCDataclass): @dataclass(frozen=True) class SarifLocation(Location): @staticmethod - @abstractmethod - def get_snippet(sarif_location) -> str: - pass + def get_snippet(sarif_location: LocationModel) -> str | None: + return sarif_location.message.text if sarif_location.message else None @classmethod def from_sarif(cls, sarif_location: LocationModel) -> Self: - artifact_location = sarif_location["physicalLocation"]["artifactLocation"] - file = Path(artifact_location["uri"]) + if not (physical_location := sarif_location.physical_location): + raise ValueError("Sarif location does not have a physical location") + if not (artifact_location := physical_location.artifact_location): + raise ValueError("Sarif location does not have an artifact location") + if not (region := physical_location.region): + raise ValueError("Sarif location does not have a region") + if not (uri := artifact_location.uri): + raise ValueError("Sarif location does not have a uri") + + file = Path(uri) snippet = cls.get_snippet(sarif_location) start = LineInfo( - line=sarif_location["physicalLocation"]["region"]["startLine"], - column=sarif_location["physicalLocation"]["region"]["startColumn"], + line=region.start_line or -1, + column=region.start_column or -1, snippet=snippet, ) end = LineInfo( - line=sarif_location["physicalLocation"]["region"]["endLine"], - column=sarif_location["physicalLocation"]["region"]["endColumn"], + line=region.end_line or -1, + column=region.end_column or -1, snippet=snippet, ) return cls(file=file, start=start, end=end) @@ -150,14 +156,17 @@ def extract_locations(cls, sarif_result: ResultModel) -> Sequence[Location]: ) @classmethod - def extract_related_locations(cls, sarif_result) -> Sequence[LocationWithMessage]: + def extract_related_locations( + cls, sarif_result: ResultModel + ) -> Sequence[LocationWithMessage]: return tuple( [ LocationWithMessage( - message=rel_location.get("message", {}).get("text", ""), + message=rel_location.message.text, location=cls.location_type.from_sarif(rel_location), ) - for rel_location in sarif_result.get("relatedLocations", []) + for rel_location in sarif_result.related_locations or [] + if rel_location.message ] ) @@ -187,7 +196,12 @@ def extract_rule_id( return rule_id.split(".")[-1] if truncate_rule_id else rule_id # it may be contained in the 'rule' field through the tool component in the sarif file - if (rule := result.rule) and sarif_run.tool.extensions and rule.tool_component: + if ( + (rule := result.rule) + and sarif_run.tool.extensions + and rule.tool_component + and rule.tool_component.index is not None + ): tool_index = rule.tool_component.index rule_index = rule.index return sarif_run.tool.extensions[tool_index].rules[rule_index].id diff --git a/src/codemodder/semgrep.py b/src/codemodder/semgrep.py index 35e06b90..fa2792fb 100644 --- a/src/codemodder/semgrep.py +++ b/src/codemodder/semgrep.py @@ -1,38 +1,52 @@ import itertools -import json import subprocess from pathlib import Path from tempfile import NamedTemporaryFile from typing import Iterable, Optional +from sarif_pydantic import Sarif from typing_extensions import Self, override from codemodder.context import CodemodExecutionContext from codemodder.logging import logger -from codemodder.result import Result, ResultSet, SarifLocation, SarifResult +from codemodder.result import ( + LocationModel, + Result, + ResultModel, + ResultSet, + SarifLocation, + SarifResult, +) from codemodder.sarifs import AbstractSarifToolDetector, Run class SemgrepSarifToolDetector(AbstractSarifToolDetector): @classmethod def detect(cls, run_data: Run) -> bool: - return "semgrep" in run_data.tool.driver.name + return "semgrep" in run_data.tool.driver.name.lower() class SemgrepLocation(SarifLocation): @staticmethod - def get_snippet(sarif_location) -> str: + def get_snippet(sarif_location: LocationModel) -> str | None: return ( - sarif_location["physicalLocation"]["region"].get("snippet", {}).get("text") + sarif_location.physical_location.region.snippet.text + if sarif_location.physical_location + and sarif_location.physical_location.region + and sarif_location.physical_location.region.snippet + else SarifLocation.get_snippet(sarif_location) ) class SemgrepResult(SarifResult): location_type = SemgrepLocation + @override @classmethod - def rule_url_from_id(cls, sarif_result, sarif_run, rule_id): - del sarif_result, sarif_run + def rule_url_from_id( + cls, result: ResultModel, run: Run, rule_id: str + ) -> str | None: + del result, run from core_codemods.semgrep.api import semgrep_url_from_id return semgrep_url_from_id(rule_id) @@ -41,17 +55,16 @@ def rule_url_from_id(cls, sarif_result, sarif_run, rule_id): class SemgrepResultSet(ResultSet): @classmethod def from_sarif(cls, sarif_file: str | Path, truncate_rule_id: bool = False) -> Self: - with open(sarif_file, "r", encoding="utf-8") as f: - data = json.load(f) + data = Sarif.model_validate_json(Path(sarif_file).read_text()) result_set = cls() - for sarif_run in data["runs"]: - for result in sarif_run["results"]: + for sarif_run in data.runs: + for result in sarif_run.results or []: sarif_result = SemgrepResult.from_sarif( result, sarif_run, truncate_rule_id ) result_set.add_result(sarif_result) - result_set.store_tool_data(sarif_run.get("tool", {})) + result_set.store_tool_data(sarif_run.tool.model_dump()) return result_set diff --git a/tests/codemods/semgrep/test_semgrep_django_secure_set_cookie.py b/tests/codemods/semgrep/test_semgrep_django_secure_set_cookie.py index f1c7c46e..7b8fe16a 100644 --- a/tests/codemods/semgrep/test_semgrep_django_secure_set_cookie.py +++ b/tests/codemods/semgrep/test_semgrep_django_secure_set_cookie.py @@ -38,6 +38,7 @@ def index(request, template): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -66,7 +67,7 @@ def index(request, template): "properties": {}, "ruleId": "python.django.security.audit.secure-cookies.django-secure-set-cookie", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_enable_jinja2_autoescape.py b/tests/codemods/semgrep/test_semgrep_enable_jinja2_autoescape.py index acb84e10..79b95164 100644 --- a/tests/codemods/semgrep/test_semgrep_enable_jinja2_autoescape.py +++ b/tests/codemods/semgrep/test_semgrep_enable_jinja2_autoescape.py @@ -27,6 +27,7 @@ def test_import(self, tmpdir): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -55,7 +56,7 @@ def test_import(self, tmpdir): "properties": {}, "ruleId": "python.flask.security.xss.audit.direct-use-of-jinja2.direct-use-of-jinja2", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_harden_pyyaml.py b/tests/codemods/semgrep/test_semgrep_harden_pyyaml.py index 624ecf93..e0d6fae2 100644 --- a/tests/codemods/semgrep/test_semgrep_harden_pyyaml.py +++ b/tests/codemods/semgrep/test_semgrep_harden_pyyaml.py @@ -26,6 +26,7 @@ def test_pyyaml(self, tmpdir): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -54,7 +55,7 @@ def test_pyyaml(self, tmpdir): "properties": {}, "ruleId": "python.lang.security.deserialization.avoid-pyyaml-load.avoid-pyyaml-load", } - ] + ], } ] } @@ -84,6 +85,7 @@ def index(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -112,7 +114,7 @@ def index(request): "properties": {}, "ruleId": "python.django.security.audit.avoid-insecure-deserialization.avoid-insecure-deserialization", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_jwt_decode_verify.py b/tests/codemods/semgrep/test_semgrep_jwt_decode_verify.py index 8d050904..8a560ed6 100644 --- a/tests/codemods/semgrep/test_semgrep_jwt_decode_verify.py +++ b/tests/codemods/semgrep/test_semgrep_jwt_decode_verify.py @@ -25,6 +25,7 @@ def test_import(self, tmpdir): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -52,7 +53,7 @@ def test_import(self, tmpdir): }, "ruleId": "python.jwt.security.unverified-jwt-decode.unverified-jwt-decode", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_nan_injection.py b/tests/codemods/semgrep/test_semgrep_nan_injection.py index 4e0921ba..6a4d7b49 100644 --- a/tests/codemods/semgrep/test_semgrep_nan_injection.py +++ b/tests/codemods/semgrep/test_semgrep_nan_injection.py @@ -38,6 +38,7 @@ def home(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "1932"}, @@ -112,6 +113,7 @@ def view(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "1fdbd5a"}, @@ -247,6 +249,7 @@ def view(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "asdfg"}, @@ -304,6 +307,7 @@ def view(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "q324"}, @@ -359,6 +363,7 @@ def view(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "asdtg"}, @@ -416,6 +421,7 @@ def view(request): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "asd2"}, diff --git a/tests/codemods/semgrep/test_semgrep_no_csrf_exempt.py b/tests/codemods/semgrep/test_semgrep_no_csrf_exempt.py index 50fa61d2..80e7e287 100644 --- a/tests/codemods/semgrep/test_semgrep_no_csrf_exempt.py +++ b/tests/codemods/semgrep/test_semgrep_no_csrf_exempt.py @@ -52,6 +52,7 @@ def foo(): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "a3ca2"}, diff --git a/tests/codemods/semgrep/test_semgrep_rsa_key_size.py b/tests/codemods/semgrep/test_semgrep_rsa_key_size.py index 04434a7c..28773fdc 100644 --- a/tests/codemods/semgrep/test_semgrep_rsa_key_size.py +++ b/tests/codemods/semgrep/test_semgrep_rsa_key_size.py @@ -15,6 +15,7 @@ def _run_and_assert_with_results(self, tmpdir, input_code, expected_output): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -43,7 +44,7 @@ def _run_and_assert_with_results(self, tmpdir, input_code, expected_output): "properties": {}, "ruleId": "python.cryptography.security.insufficient-rsa-key-size.insufficient-rsa-key-size", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_sandbox_process_creation.py b/tests/codemods/semgrep/test_semgrep_sandbox_process_creation.py index 845a6040..e8e25d4a 100644 --- a/tests/codemods/semgrep/test_semgrep_sandbox_process_creation.py +++ b/tests/codemods/semgrep/test_semgrep_sandbox_process_creation.py @@ -53,6 +53,7 @@ def vuln(): { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "automationDetails": { "id": ".github/workflows/semgrep.yml:semgrep_scan/" }, diff --git a/tests/codemods/semgrep/test_semgrep_sql_parametrization.py b/tests/codemods/semgrep/test_semgrep_sql_parametrization.py index 0d79cff7..11dbd1a2 100644 --- a/tests/codemods/semgrep/test_semgrep_sql_parametrization.py +++ b/tests/codemods/semgrep/test_semgrep_sql_parametrization.py @@ -49,6 +49,7 @@ def f(): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -185,7 +186,7 @@ def f(): "properties": {}, "ruleId": "python.flask.security.injection.tainted-sql-string.tainted-sql-string", }, - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_subprocess_shell_false.py b/tests/codemods/semgrep/test_semgrep_subprocess_shell_false.py index 313a10bf..3dff942e 100644 --- a/tests/codemods/semgrep/test_semgrep_subprocess_shell_false.py +++ b/tests/codemods/semgrep/test_semgrep_subprocess_shell_false.py @@ -26,6 +26,7 @@ def test_import(self, tmpdir): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -54,7 +55,7 @@ def test_import(self, tmpdir): "properties": {}, "ruleId": "python.lang.security.audit.subprocess-shell-true.subprocess-shell-true", } - ] + ], } ] } diff --git a/tests/codemods/semgrep/test_semgrep_url_sandbox.py b/tests/codemods/semgrep/test_semgrep_url_sandbox.py index 82a619a4..31a02668 100644 --- a/tests/codemods/semgrep/test_semgrep_url_sandbox.py +++ b/tests/codemods/semgrep/test_semgrep_url_sandbox.py @@ -41,6 +41,7 @@ def example(): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "370059975f"}, diff --git a/tests/codemods/semgrep/test_semgrep_use_defused_xml.py b/tests/codemods/semgrep/test_semgrep_use_defused_xml.py index 8c9fbd4e..49f3d2d5 100644 --- a/tests/codemods/semgrep/test_semgrep_use_defused_xml.py +++ b/tests/codemods/semgrep/test_semgrep_use_defused_xml.py @@ -31,6 +31,7 @@ def test_etree_parse(self, add_dependency, tmpdir): results = { "runs": [ { + "tool": {"driver": {"name": "Semgrep OSS"}}, "results": [ { "fingerprints": {"matchBasedId/v1": "123"}, @@ -58,7 +59,7 @@ def test_etree_parse(self, add_dependency, tmpdir): }, "ruleId": "python.lang.security.use-defused-xml-parse.use-defused-xml-parse", } - ] + ], } ] } diff --git a/tests/test_codeql.py b/tests/test_codeql.py index ea219e83..55bb7842 100644 --- a/tests/test_codeql.py +++ b/tests/test_codeql.py @@ -1,6 +1,7 @@ import json +import tempfile from pathlib import Path -from unittest import TestCase, mock +from unittest import TestCase import pytest @@ -36,9 +37,9 @@ class TestCodeQLResultSet(TestCase): def test_from_sarif(self): sarif_file = Path("/path/to/sarif/file.sarif") - with mock.patch( - "builtins.open", mock.mock_open(read_data=json.dumps(SARIF_CONTENT)) - ): + with tempfile.TemporaryDirectory() as tempdir: + sarif_file = Path(tempdir) / "file.sarif" + sarif_file.write_text(json.dumps(SARIF_CONTENT)) result_set = CodeQLResultSet.from_sarif(sarif_file) self.assertEqual(len(result_set), 3) @@ -211,11 +212,12 @@ def test_from_sarif(self): "driver": {"name": "CodeQL"}, "extensions": [ { + "name": "CodeQL", "rules": [ {"id": "python/sql-injection"}, {"id": "cs/web/missing-x-frame-options"}, {"id": "cs/web/xss"}, - ] + ], }, ], }, diff --git a/tests/test_regex_transformer.py b/tests/test_regex_transformer.py index b4a4c122..5f794b45 100644 --- a/tests/test_regex_transformer.py +++ b/tests/test_regex_transformer.py @@ -1,5 +1,7 @@ import logging +from sarif_pydantic import Sarif + from codemodder.codemods.regex_transformer import ( RegexTransformerPipeline, SastRegexTransformerPipeline, @@ -138,38 +140,42 @@ def test_sast_transformer(mocker, tmp_path_factory): pattern=r"hello", replacement="bye", change_description="testing" ) - data = { - "runs": [ - { - "results": [ - { - "fingerprints": {"matchBasedId/v1": "123"}, - "locations": [ - { - "ruleId": "rule", - "physicalLocation": { - "artifactLocation": { - "uri": "code.py", - "uriBaseId": "%SRCROOT%", - }, - "region": { - "snippet": {"text": "snip"}, - "endColumn": 1, - "endLine": 1, - "startColumn": 1, - "startLine": 1, + data = Sarif.model_validate( + { + "runs": [ + { + "tool": {"driver": {"name": "Semgrep OSS"}}, + "results": [ + { + "message": {"text": "message"}, + "fingerprints": {"matchBasedId/v1": "123"}, + "locations": [ + { + "ruleId": "rule", + "physicalLocation": { + "artifactLocation": { + "uri": "code.py", + "uriBaseId": "%SRCROOT%", + }, + "region": { + "snippet": {"text": "snip"}, + "endColumn": 1, + "endLine": 1, + "startColumn": 1, + "startLine": 1, + }, }, - }, - } - ], - "ruleId": "rule", - } - ] - } - ] - } - sarif_run = data["runs"] - sarif_results = sarif_run[0]["results"] + } + ], + "ruleId": "rule", + } + ], + } + ] + } + ) + sarif_run = data.runs[0] + sarif_results = sarif_run.results or [] results = [SemgrepResult.from_sarif(sarif_results[0], sarif_run)] changeset = pipeline.apply( diff --git a/tests/test_sarif_processing.py b/tests/test_sarif_processing.py index ca204236..cdaa6c63 100644 --- a/tests/test_sarif_processing.py +++ b/tests/test_sarif_processing.py @@ -1,9 +1,9 @@ -import json import subprocess from pathlib import Path import pytest from pydantic import ValidationError +from sarif_pydantic import Sarif from codemodder.codemods.semgrep import process_semgrep_findings from codemodder.sarifs import detect_sarif_tools @@ -14,11 +14,11 @@ class TestSarifProcessing: def test_extract_rule_id_codeql(self): sarif_file = Path("tests") / "samples" / "webgoat_v8.2.0_codeql.sarif" - with open(sarif_file, "r", encoding="utf-8") as f: - data = json.load(f) + data = Sarif.model_validate_json(sarif_file.read_text()) - sarif_run = data["runs"][0] - result = sarif_run["results"][0] + sarif_run = data.runs[0] + assert sarif_run.results, "No results found in SARIF file" + result = sarif_run.results[0] rule_id = SemgrepResult.extract_rule_id( result, sarif_run, truncate_rule_id=True ) @@ -27,11 +27,11 @@ def test_extract_rule_id_codeql(self): def test_extract_rule_id_semgrep(self): sarif_file = Path("tests") / "samples" / "semgrep.sarif" - with open(sarif_file, "r", encoding="utf-8") as f: - data = json.load(f) + data = Sarif.model_validate_json(sarif_file.read_text()) - sarif_run = data["runs"][0] - result = sarif_run["results"][0] + sarif_run = data.runs[0] + assert sarif_run.results, "No results found in SARIF file" + result = sarif_run.results[0] rule_id = SemgrepResult.extract_rule_id( result, sarif_run, truncate_rule_id=True ) diff --git a/tests/test_semgrep.py b/tests/test_semgrep.py index b0e4cda7..f25278d6 100644 --- a/tests/test_semgrep.py +++ b/tests/test_semgrep.py @@ -1,7 +1,7 @@ -import json from pathlib import Path import pytest +from sarif_pydantic import Sarif from codemodder.codemods.semgrep import SemgrepSarifFileDetector from codemodder.context import CodemodExecutionContext @@ -20,8 +20,8 @@ def test_semgrep_sarif_tool_detector(filename, expected): detector = SemgrepSarifToolDetector() sarif_path = SAMPLE_DATA_PATH / filename - data = json.load(sarif_path.open()) - assert detector.detect(data["runs"][0]) is expected + data = Sarif.model_validate_json(sarif_path.read_text()) + assert detector.detect(data.runs[0]) is expected def test_semgrep_sarif_codemode_detector(mocker): diff --git a/tests/test_xml_transformer.py b/tests/test_xml_transformer.py index bd1c3192..34956569 100644 --- a/tests/test_xml_transformer.py +++ b/tests/test_xml_transformer.py @@ -6,6 +6,7 @@ import pytest from defusedxml import ExternalReferenceForbidden from defusedxml.sax import make_parser +from sarif_pydantic import Sarif from codemodder.codemods.xml_transformer import ( ElementAttributeXMLTransformer, @@ -127,38 +128,42 @@ def test_change_only_line_matching_result(self): """ name_attr_map = {"element": {"first": "one"}} - data = { - "runs": [ - { - "results": [ - { - "fingerprints": {"matchBasedId/v1": "123"}, - "locations": [ - { - "ruleId": "rule", - "physicalLocation": { - "artifactLocation": { - "uri": "code.py", - "uriBaseId": "%SRCROOT%", + data = Sarif.model_validate( + { + "runs": [ + { + "tool": {"driver": {"name": "Semgrep OSS"}}, + "results": [ + { + "message": {"text": "message"}, + "fingerprints": {"matchBasedId/v1": "123"}, + "locations": [ + { + "ruleId": "rule", + "physicalLocation": { + "artifactLocation": { + "uri": "code.py", + "uriBaseId": "%SRCROOT%", + }, + "region": { + "snippet": {"text": "snip"}, + "endColumn": 1, + "endLine": 5, + "startColumn": 1, + "startLine": 5, + }, }, - "region": { - "snippet": {"text": "snip"}, - "endColumn": 1, - "endLine": 5, - "startColumn": 1, - "startLine": 5, - }, - }, - } - ], - "ruleId": "rule", - } - ] - } - ] - } - sarif_run = data["runs"] - sarif_results = sarif_run[0]["results"] + } + ], + "ruleId": "rule", + } + ], + } + ] + } + ) + sarif_run = data.runs[0] + sarif_results = sarif_run.results or [] results = [SemgrepResult.from_sarif(sarif_results[0], sarif_run)] self.run_and_assert(name_attr_map, input_code, expected_output, results, True) From 878e206d1245a76ce0511d37c62063d5f9dbe422 Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Wed, 2 Apr 2025 11:43:03 -0400 Subject: [PATCH 4/6] Bump sarif-pydantic dependency to 0.4.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 711ac9b4..02fc480d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "tomlkit~=0.13.0", "wrapt~=1.17.0", "chardet~=5.2.0", - "sarif-pydantic~=0.3.0", + "sarif-pydantic~=0.4.0", "setuptools~=78.1", ] keywords = ["codemod", "codemods", "security", "fix", "fixes"] From 60ca4a8f18674289aaa965d8987614446c6ddb6d Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Wed, 2 Apr 2025 11:45:05 -0400 Subject: [PATCH 5/6] Fix Sonar warnings --- tests/test_codeql.py | 1 - tests/test_xml_transformer.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_codeql.py b/tests/test_codeql.py index 55bb7842..2591c686 100644 --- a/tests/test_codeql.py +++ b/tests/test_codeql.py @@ -36,7 +36,6 @@ def test_from_duplicate_files(codeql_result_set: Path): class TestCodeQLResultSet(TestCase): def test_from_sarif(self): - sarif_file = Path("/path/to/sarif/file.sarif") with tempfile.TemporaryDirectory() as tempdir: sarif_file = Path(tempdir) / "file.sarif" sarif_file.write_text(json.dumps(SARIF_CONTENT)) diff --git a/tests/test_xml_transformer.py b/tests/test_xml_transformer.py index 34956569..f8602806 100644 --- a/tests/test_xml_transformer.py +++ b/tests/test_xml_transformer.py @@ -163,7 +163,8 @@ def test_change_only_line_matching_result(self): } ) sarif_run = data.runs[0] - sarif_results = sarif_run.results or [] + assert sarif_run.results + sarif_results = sarif_run.results results = [SemgrepResult.from_sarif(sarif_results[0], sarif_run)] self.run_and_assert(name_attr_map, input_code, expected_output, results, True) From 2f64d5898894d33108fb53aa8f9594fc2f874d6e Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Thu, 3 Apr 2025 15:53:36 -0400 Subject: [PATCH 6/6] Bump sarif-pydantic to 0.5.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 02fc480d..78dd036b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "tomlkit~=0.13.0", "wrapt~=1.17.0", "chardet~=5.2.0", - "sarif-pydantic~=0.4.0", + "sarif-pydantic~=0.5.0", "setuptools~=78.1", ] keywords = ["codemod", "codemods", "security", "fix", "fixes"]