Skip to content

Commit 66b796d

Browse files
authored
Add detection and processing for external Semgrep SARIF files (#459)
* Refactor semgrep SARIFs into semgrep module * Add SARIF detector plugin for semgrep * Add semgrep SARIF file detection and processing
1 parent 9c2836e commit 66b796d

File tree

9 files changed

+31978
-72
lines changed

9 files changed

+31978
-72
lines changed

.pre-commit-config.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ repos:
1414
tests/.*
1515
)$
1616
- id: check-added-large-files
17+
exclude: |
18+
(?x)^(
19+
tests/samples/pygoat.semgrep.sarif.json
20+
)$
1721
- repo: https://github.com/psf/black
1822
rev: 24.3.0
1923
hooks:

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ core = "core_codemods:registry"
8383
sonar = "core_codemods:sonar_registry"
8484
defectdojo = "core_codemods:defectdojo_registry"
8585

86+
[project.entry-points.sarif_detectors]
87+
"semgrep" = "codemodder.semgrep:SemgrepSarifToolDetector"
88+
8689
[tool.setuptools]
8790

8891
[tool.setuptools.package-data]

src/codemodder/codemods/semgrep.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
import io
22
import os
33
import tempfile
4+
from functools import cache
45
from pathlib import Path
56

67
import yaml
78

89
from codemodder.codemods.base_detector import BaseDetector
910
from codemodder.context import CodemodExecutionContext
1011
from codemodder.result import ResultSet
12+
from codemodder.semgrep import SemgrepResultSet
1113
from codemodder.semgrep import run as semgrep_run
1214

1315

@@ -47,3 +49,25 @@ def apply(
4749
yaml_files = self.get_yaml_files(codemod_id)
4850
with context.timer.measure("semgrep"):
4951
return semgrep_run(context, yaml_files, files_to_analyze)
52+
53+
54+
class SemgrepSarifFileDetector(BaseDetector):
    """Detector that reads findings from Semgrep SARIF files listed in the context."""

    def apply(
        self,
        codemod_id: str,
        context: CodemodExecutionContext,
        files_to_analyze: list[Path],
    ) -> ResultSet:
        """
        Return the findings parsed from the files registered under the
        "semgrep" key of ``context.tool_result_files_map``.

        ``codemod_id`` and ``files_to_analyze`` are part of the detector
        signature but are not used here.
        """
        # Neither argument influences the outcome; discard them explicitly.
        del codemod_id, files_to_analyze
        sarif_files = context.tool_result_files_map.get("semgrep", ())
        # process_semgrep_findings is cached, so its argument must be hashable:
        # hand it a tuple rather than the list stored in the map.
        return process_semgrep_findings(tuple(sarif_files))
66+
67+
68+
@cache
def process_semgrep_findings(semgrep_sarif_files: tuple[str, ...]) -> ResultSet:
    """
    Parse every given Semgrep SARIF file and merge the findings into one result set.

    The function is memoized with ``functools.cache``: the argument has to be
    hashable (hence a tuple of paths, of any length) and a repeated call with
    an equal tuple returns the very same object instead of re-reading the files.

    NOTE(review): because the cached object is shared between callers, it is
    presumably meant to be treated as read-only -- confirm against callers.
    """
    results = SemgrepResultSet()
    # `or ()` keeps a falsy argument (e.g. None) from breaking the loop.
    for file in semgrep_sarif_files or ():
        results |= SemgrepResultSet.from_sarif(file)
    return results

src/codemodder/sarifs.py

Lines changed: 1 addition & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,10 @@
33
from collections import defaultdict
44
from importlib.metadata import entry_points
55
from pathlib import Path
6-
from typing import DefaultDict, Optional
7-
8-
from typing_extensions import Self
6+
from typing import DefaultDict
97

108
from codemodder.logging import logger
119

12-
from .result import LineInfo, Location, Result, ResultSet
13-
1410

1511
class AbstractSarifToolDetector(metaclass=ABCMeta):
1612
@classmethod
@@ -39,65 +35,3 @@ def detect_sarif_tools(filenames: list[Path]) -> DefaultDict[str, list[str]]:
3935
continue
4036

4137
return results
42-
43-
44-
def extract_rule_id(result, sarif_run) -> Optional[str]:
45-
if "ruleId" in result:
46-
# semgrep prepends the folders into the rule-id, we want the base name only
47-
return result["ruleId"].rsplit(".")[-1]
48-
49-
# it may be contained in the 'rule' field through the tool component in the sarif file
50-
if "rule" in result:
51-
tool_index = result["rule"]["toolComponent"]["index"]
52-
rule_index = result["rule"]["index"]
53-
return sarif_run["tool"]["extensions"][tool_index]["rules"][rule_index]["id"]
54-
55-
return None
56-
57-
58-
# NOTE: These Sarif classes are actually specific to Semgrep and should be moved elsewhere
59-
class SarifLocation(Location):
60-
@classmethod
61-
def from_sarif(cls, sarif_location) -> Self:
62-
artifact_location = sarif_location["physicalLocation"]["artifactLocation"]
63-
file = Path(artifact_location["uri"])
64-
start = LineInfo(
65-
line=sarif_location["physicalLocation"]["region"]["startLine"],
66-
column=sarif_location["physicalLocation"]["region"]["startColumn"],
67-
snippet=sarif_location["physicalLocation"]["region"]["snippet"]["text"],
68-
)
69-
end = LineInfo(
70-
line=sarif_location["physicalLocation"]["region"]["endLine"],
71-
column=sarif_location["physicalLocation"]["region"]["endColumn"],
72-
snippet=sarif_location["physicalLocation"]["region"]["snippet"]["text"],
73-
)
74-
return cls(file=file, start=start, end=end)
75-
76-
77-
class SarifResult(Result):
78-
@classmethod
79-
def from_sarif(cls, sarif_result, sarif_run) -> Self:
80-
rule_id = extract_rule_id(sarif_result, sarif_run)
81-
if not rule_id:
82-
raise ValueError("Could not extract rule id from sarif result.")
83-
84-
locations: list[Location] = []
85-
for location in sarif_result["locations"]:
86-
artifact_location = SarifLocation.from_sarif(location)
87-
locations.append(artifact_location)
88-
return cls(rule_id=rule_id, locations=locations)
89-
90-
91-
class SarifResultSet(ResultSet):
92-
@classmethod
93-
def from_sarif(cls, sarif_file: str | Path) -> Self:
94-
with open(sarif_file, "r", encoding="utf-8") as f:
95-
data = json.load(f)
96-
97-
result_set = cls()
98-
for sarif_run in data["runs"]:
99-
for result in sarif_run["results"]:
100-
sarif_result = SarifResult.from_sarif(result, sarif_run)
101-
result_set.add_result(sarif_result)
102-
103-
return result_set

src/codemodder/semgrep.py

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,93 @@
11
import itertools
2+
import json
23
import subprocess
34
from pathlib import Path
45
from tempfile import NamedTemporaryFile
56
from typing import Iterable, Optional
67

8+
from typing_extensions import Self
9+
710
from codemodder.context import CodemodExecutionContext
811
from codemodder.logging import logger
9-
from codemodder.sarifs import SarifResultSet
12+
from codemodder.result import LineInfo, Location, Result, ResultSet
13+
from codemodder.sarifs import AbstractSarifToolDetector
14+
15+
16+
class SemgrepSarifToolDetector(AbstractSarifToolDetector):
    """SARIF tool detector that recognizes runs whose driver name mentions Semgrep."""

    @classmethod
    def detect(cls, run_data: dict) -> bool:
        """
        Return True when ``run_data["tool"]["driver"]["name"]`` contains
        "semgrep" (case-insensitive).

        A run that lacks any of those keys, or whose name is not a string, is
        reported as "not Semgrep" instead of raising, since the detector may be
        handed SARIF produced by any tool.
        """
        try:
            driver_name = run_data["tool"]["driver"]["name"]
        except (KeyError, TypeError):
            # Missing or malformed tool/driver section: cannot be identified as Semgrep.
            return False
        return isinstance(driver_name, str) and "semgrep" in driver_name.lower()
23+
24+
25+
def extract_rule_id(result, sarif_run) -> Optional[str]:
    """
    Return the rule id of a SARIF ``result``, or None when it cannot be determined.

    Lookup order:
      1. ``result["ruleId"]``, reduced to the part after the last dot.
      2. ``result["rule"]`` resolved through ``toolComponent.index`` and
         ``index`` into ``sarif_run["tool"]["extensions"][...]["rules"][...]["id"]``
         (returned as stored there).
      3. ``result["rule"]["id"]``, reduced like ``ruleId``, when the indexed
         lookup is not possible.

    :param result: one entry of a SARIF run's ``results`` list
    :param sarif_run: the SARIF run object that contains ``result``
    """
    if "ruleId" in result:
        # semgrep prepends the folders into the rule-id, we want the base name only
        return result["ruleId"].rpartition(".")[-1]

    if "rule" not in result:
        return None

    # it may be contained in the 'rule' field through the tool component in the sarif file
    rule_ref = result["rule"]
    try:
        tool_index = rule_ref["toolComponent"]["index"]
        rule_index = rule_ref["index"]
        # SARIF uses -1 for "index not set"; without this guard Python would
        # wrap around and silently pick the last extension/rule.
        if tool_index >= 0 and rule_index >= 0:
            return sarif_run["tool"]["extensions"][tool_index]["rules"][rule_index][
                "id"
            ]
    except (KeyError, IndexError, TypeError):
        # Incomplete or dangling reference: fall through to the reference's own id.
        pass

    fallback_id = rule_ref.get("id") if isinstance(rule_ref, dict) else None
    if isinstance(fallback_id, str):
        # The reference's own id plays the same role as ruleId, so trim it the same way.
        return fallback_id.rpartition(".")[-1]
    return None
37+
38+
39+
class SemgrepLocation(Location):
    """Location built from one entry of a SARIF result's ``locations`` list."""

    @classmethod
    def from_sarif(cls, sarif_location) -> Self:
        """
        Build a location from a SARIF location object.

        The file comes from ``physicalLocation.artifactLocation.uri``; the start
        and end positions come from ``physicalLocation.region``. Both positions
        carry the same snippet text (``region.snippet.text``).
        """
        physical = sarif_location["physicalLocation"]
        file = Path(physical["artifactLocation"]["uri"])
        region = physical["region"]
        start = LineInfo(
            line=region["startLine"],
            column=region["startColumn"],
            snippet=region["snippet"]["text"],
        )
        end = LineInfo(
            line=region["endLine"],
            column=region["endColumn"],
            snippet=region["snippet"]["text"],
        )
        return cls(file=file, start=start, end=end)
55+
56+
57+
class SemgrepResult(Result):
    """Result built from one entry of a SARIF run's ``results`` list."""

    @classmethod
    def from_sarif(cls, sarif_result, sarif_run) -> Self:
        """
        Build a result from a SARIF result object and the run that contains it.

        :raises ValueError: when no rule id can be extracted from the result
        """
        rule_id = extract_rule_id(sarif_result, sarif_run)
        if not rule_id:
            raise ValueError("Could not extract rule id from sarif result.")

        # One SemgrepLocation per entry of the result's "locations" list.
        locations: list[Location] = [
            SemgrepLocation.from_sarif(entry) for entry in sarif_result["locations"]
        ]
        return cls(rule_id=rule_id, locations=locations)
69+
70+
71+
class SemgrepResultSet(ResultSet):
    """Result set populated from the contents of a SARIF file."""

    @classmethod
    def from_sarif(cls, sarif_file: str | Path) -> Self:
        """
        Load ``sarif_file`` (UTF-8 JSON) and collect every result of every run.

        Each entry of ``runs[*].results`` is converted with
        ``SemgrepResult.from_sarif`` and added to a new result set.
        """
        with open(sarif_file, "r", encoding="utf-8") as handle:
            data = json.load(handle)

        result_set = cls()
        for sarif_run in data["runs"]:
            for raw_result in sarif_run["results"]:
                # The run is passed along so rule ids can be resolved through it.
                result_set.add_result(SemgrepResult.from_sarif(raw_result, sarif_run))
        return result_set
1084

1185

1286
def run(
1387
execution_context: CodemodExecutionContext,
1488
yaml_files: Iterable[Path],
1589
files_to_analyze: Optional[Iterable[Path]] = None,
16-
) -> SarifResultSet:
90+
) -> SemgrepResultSet:
1791
"""
1892
Runs Semgrep and outputs a dict with the results organized by rule_id.
1993
"""
@@ -49,5 +123,5 @@ def run(
49123
if not execution_context.verbose:
50124
logger.error("captured semgrep stderr: %s", call.stderr)
51125
raise subprocess.CalledProcessError(call.returncode, command)
52-
results = SarifResultSet.from_sarif(temp_sarif_file.name)
126+
results = SemgrepResultSet.from_sarif(temp_sarif_file.name)
53127
return results

src/core_codemods/sonar/api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ def apply(
5959
context: CodemodExecutionContext,
6060
files_to_analyze: list[Path],
6161
) -> ResultSet:
62+
del codemod_id
63+
del files_to_analyze
6264
sonar_findings = process_sonar_findings(
6365
tuple(
6466
context.tool_result_files_map.get("sonar", ())

0 commit comments

Comments
 (0)