Skip to content

Commit 7e5a741

Browse files
authored
Avoid duplicate findings from multiple sarif files (#927)
* Add example CodeQL SARIF file
* Prevent duplicate findings from multiple sarif files
* Account for frozen datatypes when updating finding metadata
* Avoid warnings when running test pipeline
1 parent 39776ac commit 7e5a741

File tree

11 files changed, with 448 additions (+448) and 316 deletions (-316).

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ readme = "README.md"
1111
license = {file = "LICENSE"}
1212
description = "A pluggable framework for building codemods in Python"
1313
dependencies = [
14+
"boltons~=21.0.0",
1415
"GitPython<4",
1516
"isort>=5.12,<5.14",
1617
"libcst>=1.1,<1.6",
@@ -118,6 +119,7 @@ version_file = "src/codemodder/_version.py"
118119
[tool.pytest.ini_options]
119120
# Ignore integration tests and ci tests by default
120121
testpaths = ["tests"]
122+
asyncio_default_fixture_loop_scope = "function"
121123

122124
[tool.black]
123125
extend-exclude = '''

src/codemodder/codetf.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,16 @@ def validate_description(self):
7676
raise ValueError("description must not be empty")
7777
return self
7878

79+
def with_findings(self, findings: list[Finding] | None) -> Change:
80+
return Change(
81+
lineNumber=self.lineNumber,
82+
description=self.description,
83+
diffSide=self.diffSide,
84+
properties=self.properties,
85+
packageActions=self.packageActions,
86+
findings=findings,
87+
)
88+
7989

8090
class AIMetadata(BaseModel):
8191
provider: Optional[str] = None
@@ -99,6 +109,16 @@ class ChangeSet(BaseModel):
99109
strategy: Optional[Strategy] = None
100110
provisional: Optional[bool] = False
101111

112+
def with_changes(self, changes: list[Change]) -> ChangeSet:
113+
return ChangeSet(
114+
path=self.path,
115+
diff=self.diff,
116+
changes=changes,
117+
ai=self.ai,
118+
strategy=self.strategy,
119+
provisional=self.provisional,
120+
)
121+
102122

103123
class Reference(BaseModel):
104124
url: str
@@ -115,11 +135,17 @@ class Rule(BaseModel):
115135
name: str
116136
url: Optional[str] = None
117137

138+
class Config:
139+
frozen = True
140+
118141

119142
class Finding(BaseModel):
120143
id: str
121144
rule: Rule
122145

146+
class Config:
147+
frozen = True
148+
123149
def to_unfixed_finding(
124150
self,
125151
*,
@@ -135,6 +161,12 @@ def to_unfixed_finding(
135161
reason=reason,
136162
)
137163

164+
def with_rule(self, name: str, url: Optional[str]) -> Finding:
165+
return Finding(
166+
id=self.id,
167+
rule=Rule(id=self.rule.id, name=name, url=url),
168+
)
169+
138170

139171
class UnfixedFinding(Finding):
140172
path: str

src/codemodder/result.py

Lines changed: 46 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@
44
from abc import abstractmethod
55
from dataclasses import dataclass, field
66
from pathlib import Path
7-
from typing import TYPE_CHECKING, Any, ClassVar, Type
7+
from typing import TYPE_CHECKING, Any, ClassVar, Sequence, Type
88

99
import libcst as cst
10+
from boltons.setutils import IndexedSet
1011
from libcst._position import CodeRange
1112
from typing_extensions import Self
1213

@@ -18,39 +19,40 @@
1819
from codemodder.context import CodemodExecutionContext
1920

2021

21-
@dataclass
22+
@dataclass(frozen=True)
2223
class LineInfo:
2324
line: int
2425
column: int = -1
2526
snippet: str | None = None
2627

2728

28-
@dataclass
29+
@dataclass(frozen=True)
2930
class Location(ABCDataclass):
3031
file: Path
3132
start: LineInfo
3233
end: LineInfo
3334

3435

36+
@dataclass(frozen=True)
3537
class SarifLocation(Location):
3638
@classmethod
3739
@abstractmethod
3840
def from_sarif(cls, sarif_location) -> Self:
3941
pass
4042

4143

42-
@dataclass
44+
@dataclass(frozen=True)
4345
class LocationWithMessage:
4446
location: Location
4547
message: str
4648

4749

48-
@dataclass(kw_only=True)
50+
@dataclass(frozen=True, kw_only=True)
4951
class Result(ABCDataclass):
5052
rule_id: str
51-
locations: list[Location]
52-
codeflows: list[list[Location]] = field(default_factory=list)
53-
related_locations: list[LocationWithMessage] = field(default_factory=list)
53+
locations: Sequence[Location]
54+
codeflows: Sequence[Sequence[Location]] = field(default_factory=tuple)
55+
related_locations: Sequence[LocationWithMessage] = field(default_factory=tuple)
5456
finding: Finding | None = None
5557

5658
def match_location(self, pos: CodeRange, node: cst.CSTNode) -> bool:
@@ -67,13 +69,16 @@ def match_location(self, pos: CodeRange, node: cst.CSTNode) -> bool:
6769
for location in self.locations
6870
)
6971

72+
def __hash__(self):
73+
return hash(self.rule_id)
7074

71-
@dataclass(kw_only=True)
75+
76+
@dataclass(frozen=True, kw_only=True)
7277
class SASTResult(Result):
7378
finding_id: str
7479

7580

76-
@dataclass(kw_only=True)
81+
@dataclass(frozen=True, kw_only=True)
7782
class SarifResult(SASTResult, ABCDataclass):
7883
location_type: ClassVar[Type[SarifLocation]]
7984

@@ -84,32 +89,40 @@ def from_sarif(
8489
raise NotImplementedError
8590

8691
@classmethod
87-
def extract_locations(cls, sarif_result) -> list[Location]:
88-
return [
89-
cls.location_type.from_sarif(location)
90-
for location in sarif_result["locations"]
91-
]
92+
def extract_locations(cls, sarif_result) -> Sequence[Location]:
93+
return tuple(
94+
[
95+
cls.location_type.from_sarif(location)
96+
for location in sarif_result["locations"]
97+
]
98+
)
9299

93100
@classmethod
94-
def extract_related_locations(cls, sarif_result) -> list[LocationWithMessage]:
95-
return [
96-
LocationWithMessage(
97-
message=rel_location.get("message", {}).get("text", ""),
98-
location=cls.location_type.from_sarif(rel_location),
99-
)
100-
for rel_location in sarif_result.get("relatedLocations", [])
101-
]
101+
def extract_related_locations(cls, sarif_result) -> Sequence[LocationWithMessage]:
102+
return tuple(
103+
[
104+
LocationWithMessage(
105+
message=rel_location.get("message", {}).get("text", ""),
106+
location=cls.location_type.from_sarif(rel_location),
107+
)
108+
for rel_location in sarif_result.get("relatedLocations", [])
109+
]
110+
)
102111

103112
@classmethod
104-
def extract_code_flows(cls, sarif_result) -> list[list[Location]]:
105-
return [
113+
def extract_code_flows(cls, sarif_result) -> Sequence[Sequence[Location]]:
114+
return tuple(
106115
[
107-
cls.location_type.from_sarif(locations.get("location"))
108-
for locations in threadflow.get("locations", {})
116+
tuple(
117+
[
118+
cls.location_type.from_sarif(locations.get("location"))
119+
for locations in threadflow.get("locations", {})
120+
]
121+
)
122+
for codeflow in sarif_result.get("codeFlows", {})
123+
for threadflow in codeflow.get("threadFlows", {})
109124
]
110-
for codeflow in sarif_result.get("codeFlows", {})
111-
for threadflow in codeflow.get("threadFlows", {})
112-
]
125+
)
113126

114127
@classmethod
115128
def extract_rule_id(cls, result, sarif_run, truncate_rule_id: bool = False) -> str:
@@ -199,5 +212,7 @@ def list_dict_or(
199212
) -> dict[Any, list[Any]]:
200213
result_dict = {}
201214
for k in other.keys() | dictionary.keys():
202-
result_dict[k] = dictionary.get(k, []) + other.get(k, [])
215+
result_dict[k] = list(
216+
IndexedSet(dictionary.get(k, [])) | (IndexedSet(other.get(k, [])))
217+
)
203218
return result_dict

src/codemodder/utils/abc_dataclass.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from dataclasses import dataclass
33

44

5-
@dataclass
5+
@dataclass(frozen=True)
66
class ABCDataclass(ABC):
77
"""Inspired by https://stackoverflow.com/a/60669138"""
88

src/codemodder/utils/update_finding_metadata.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
if typing.TYPE_CHECKING:
66
from codemodder.codemods.base_codemod import ToolRule
77

8-
from codemodder.codetf import ChangeSet
8+
from codemodder.codetf import Change, ChangeSet
99

1010

1111
def update_finding_metadata(
@@ -15,12 +15,23 @@ def update_finding_metadata(
1515
if not (tool_rule_map := {rule.id: (rule.name, rule.url) for rule in tool_rules}):
1616
return changesets
1717

18+
new_changesets: list[ChangeSet] = []
1819
for changeset in changesets:
20+
new_changes: list[Change] = []
1921
for change in changeset.changes:
20-
for finding in change.findings or []:
21-
if finding.id in tool_rule_map:
22-
finding.rule.name = tool_rule_map[finding.id][0]
23-
finding.rule.url = tool_rule_map[finding.id][1]
22+
new_changes.append(
23+
change.with_findings(
24+
[
25+
(
26+
finding.with_rule(*tool_rule_map[finding.rule.id])
27+
if finding.rule.id in tool_rule_map
28+
else finding
29+
)
30+
for finding in change.findings or []
31+
]
32+
or None
33+
)
34+
)
35+
new_changesets.append(changeset.with_changes(new_changes))
2436

25-
# TODO: eventually make this functional and return a new list
26-
return changesets
37+
return new_changesets

src/core_codemods/defectdojo/results.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ def from_result(cls, result: dict) -> Self:
2727
return cls(
2828
finding_id=result["id"],
2929
rule_id=result["title"],
30-
locations=[DefectDojoLocation.from_result(result)],
30+
locations=tuple([DefectDojoLocation.from_result(result)]),
3131
finding=Finding(
3232
id=str(result["id"]),
3333
rule=Rule(

src/core_codemods/sonar/results.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from dataclasses import replace
33
from functools import cache
44
from pathlib import Path
5+
from typing import Sequence
56

67
import libcst as cst
78
from typing_extensions import Self
@@ -40,18 +41,22 @@ def from_result(cls, result: dict) -> Self:
4041
if not (rule_id := result.get("rule", None) or result.get("ruleKey", None)):
4142
raise ValueError("Could not extract rule id from sarif result.")
4243

43-
locations: list[Location] = (
44+
locations: Sequence[Location] = tuple(
4445
[SonarLocation.from_json_location(result)]
4546
if result.get("textRange")
4647
else []
4748
)
48-
all_flows: list[list[Location]] = [
49+
all_flows: Sequence[Sequence[Location]] = tuple(
4950
[
50-
SonarLocation.from_json_location(json_location)
51-
for json_location in flow.get("locations", {})
51+
tuple(
52+
[
53+
SonarLocation.from_json_location(json_location)
54+
for json_location in flow.get("locations", {})
55+
]
56+
)
57+
for flow in result.get("flows", [])
5258
]
53-
for flow in result.get("flows", [])
54-
]
59+
)
5560

5661
finding_id = result.get("key", rule_id)
5762

tests/samples/codeql/python/vulnerable-code-snippets.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)