Skip to content

Commit 3c1cc98

Browse files
committed
Improve detector precision with contextual/entropy gates and reduce secret-scanner false positives
1 parent b0b3eb9 commit 3c1cc98

File tree

8 files changed

+275
-29
lines changed

.leaklensignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,4 @@
22
examples/generated/**
33
examples/vulnerable_repo/**
44
tests/**
5-
src/leaklens/rules.py
6-
src/leaklens/detectors/context.py
75
*.min.js

src/leaklens/detectors/context.py

Lines changed: 85 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,29 @@
44

55
import re
66

7+
from .entropy import shannon_entropy
78
from ..models import DetectionMatch, DetectorSource, Severity
89

910
SUSPICIOUS_NAMES = {
1011
"password",
1112
"passwd",
1213
"secret",
1314
"token",
14-
"key",
15-
"api",
16-
"apikey",
1715
"auth",
1816
"credential",
19-
"private",
17+
"apikey",
2018
}
2119

22-
ASSIGNMENT_PATTERN = re.compile(
23-
r"(?P<name>[A-Za-z_][A-Za-z0-9_\-]*)\s*[:=]\s*(?P<quote>['\"]?)(?P<value>[^'\"\n#]{6,})(?P=quote)"
20+
QUOTED_ASSIGNMENT_PATTERN = re.compile(
21+
r"(?P<name>[A-Za-z_][A-Za-z0-9_\-]*)\s*[:=]\s*(?P<quote>['\"])(?P<value>[^'\"\n#]{6,})(?P=quote)"
2422
)
23+
UNQUOTED_ENV_PATTERN = re.compile(r"(?P<name>[A-Za-z_][A-Za-z0-9_\-]*)\s*[:=]\s*(?P<value>[^\s#]{12,})\s*$")
2524

2625
CONNECTION_STRING_PATTERN = re.compile(
2726
r"(?i)\b(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp|mssql)://[^\s:/]+:[^\s@/]+@[^\s]+"
2827
)
2928

30-
AUTH_CONTEXT_PATTERN = re.compile(r"(?i)\b(auth|login|security|credential|config)\b")
29+
AUTH_CONTEXT_PATTERN = re.compile(r"(?i)\b(auth|login|security|credential|bearer|jwt|oauth|session)\b")
3130

3231
PLACEHOLDERS = {"changeme", "example", "sample", "test", "dummy", "password", "secret"}
3332
NAME_SPLIT_PATTERN = re.compile(r"[^A-Za-z0-9]+")
@@ -41,6 +40,10 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
4140
del line_number
4241
hits: list[DetectionMatch] = []
4342
lowered = line.lower()
43+
stripped = line.strip()
44+
45+
if re.match(r"^\s*(#|//|/\*|\*)", stripped):
46+
return []
4447

4548
for match in CONNECTION_STRING_PATTERN.finditer(line):
4649
value = match.group(0)
@@ -66,7 +69,7 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
6669
)
6770
)
6871

69-
for match in ASSIGNMENT_PATTERN.finditer(line):
72+
for match in QUOTED_ASSIGNMENT_PATTERN.finditer(line):
7073
name = match.group("name")
7174
value = match.group("value").strip()
7275
if _should_skip_assignment_value(value):
@@ -101,29 +104,64 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
101104
)
102105
)
103106

107+
if file_path.endswith(".env"):
108+
for match in UNQUOTED_ENV_PATTERN.finditer(line):
109+
name = match.group("name")
110+
value = match.group("value").strip()
111+
if _should_skip_assignment_value(value):
112+
continue
113+
if not _is_suspicious_name(name):
114+
continue
115+
if len(value) < 16:
116+
continue
117+
118+
hits.append(
119+
DetectionMatch(
120+
finding_type="Suspicious Hardcoded Credential",
121+
value=value,
122+
start=match.start("value"),
123+
end=match.end("value"),
124+
source=DetectorSource.CONTEXT,
125+
confidence=0.74,
126+
severity=Severity.HIGH,
127+
risk="Hardcoded credentials are often propagated to logs, forks, and artifacts.",
128+
remediation="Replace literal with an environment variable and rotate if already exposed.",
129+
safer_alternative=(
130+
"Use os.getenv() in Python or process.env in Node with secret manager injection."
131+
),
132+
autofix=_autofix_for_assignment(file_path, name),
133+
)
134+
)
135+
104136
if AUTH_CONTEXT_PATTERN.search(lowered):
105137
literals = _extract_string_literals(line)
106138
for value, start, end in literals:
107-
if len(value) < 12 or _is_placeholder(value):
139+
if len(value) < 16 or _is_placeholder(value):
140+
continue
141+
if _should_skip_assignment_value(value):
108142
continue
109-
if _has_diverse_chars(value):
110-
hits.append(
111-
DetectionMatch(
112-
finding_type="Auth Context Secret Literal",
113-
value=value,
114-
start=start,
115-
end=end,
116-
source=DetectorSource.CONTEXT,
117-
confidence=0.62,
118-
severity=Severity.MEDIUM,
119-
risk="Sensitive literals near auth/security code are likely real secrets.",
120-
remediation="Move this literal to secrets storage and reference by env variable.",
121-
safer_alternative=(
122-
"Use runtime secret injection and avoid embedding values in repository code."
123-
),
124-
autofix="Replace literal with os.getenv(\"AUTH_SECRET\") and define .env.example entry.",
125-
)
143+
if shannon_entropy(value) < 3.4:
144+
continue
145+
if not _looks_secret_like_literal(value):
146+
continue
147+
148+
hits.append(
149+
DetectionMatch(
150+
finding_type="Auth Context Secret Literal",
151+
value=value,
152+
start=start,
153+
end=end,
154+
source=DetectorSource.CONTEXT,
155+
confidence=0.65,
156+
severity=Severity.MEDIUM,
157+
risk="Sensitive literals near auth/security code are likely real secrets.",
158+
remediation="Move this literal to secrets storage and reference by env variable.",
159+
safer_alternative=(
160+
"Use runtime secret injection and avoid embedding values in repository code."
161+
),
162+
autofix="Replace literal with os.getenv(\"AUTH_SECRET\") and define .env.example entry.",
126163
)
164+
)
127165

128166
return hits
129167

@@ -158,6 +196,12 @@ def _is_suspicious_name(name: str) -> bool:
158196
return True
159197
if {"private", "key"}.issubset(token_set):
160198
return True
199+
if {"access", "key"}.issubset(token_set):
200+
return True
201+
if {"secret", "key"}.issubset(token_set):
202+
return True
203+
if {"client", "secret"}.issubset(token_set):
204+
return True
161205

162206
return any(token in SUSPICIOUS_NAMES for token in token_set)
163207

@@ -178,6 +222,8 @@ def _should_skip_assignment_value(value: str) -> bool:
178222
return True
179223
if stripped.isdigit():
180224
return True
225+
if "(" in stripped or ")" in stripped:
226+
return True
181227
if lowered.startswith(("http://", "https://", "file://")):
182228
return True
183229
if lowered.startswith(("./", "../", "/")):
@@ -187,6 +233,19 @@ def _should_skip_assignment_value(value: str) -> bool:
187233
return False
188234

189235

236+
def _looks_secret_like_literal(value: str) -> bool:
237+
if len(value) < 16:
238+
return False
239+
if " " in value:
240+
return False
241+
has_upper = any(char.isupper() for char in value)
242+
has_lower = any(char.islower() for char in value)
243+
has_digit = any(char.isdigit() for char in value)
244+
has_symbol = any(not char.isalnum() for char in value)
245+
score = sum([has_upper, has_lower, has_digit, has_symbol])
246+
return score >= 2
247+
248+
190249
def _to_snake_case(name: str) -> str:
191250
with_boundaries = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
192251
return with_boundaries.replace("-", "_").lower()

src/leaklens/detectors/entropy.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$", re.IGNORECASE
1818
)
1919
HEX_DIGEST_PATTERN = re.compile(r"^[0-9a-f]{32}$|^[0-9a-f]{40}$|^[0-9a-f]{64}$", re.IGNORECASE)
20+
SECRET_CONTEXT_PATTERN = re.compile(
21+
r"(?i)\b(password|passwd|secret|token|api[_-]?key|auth|credential|private[_-]?key|access[_-]?key|bearer|jwt)\b"
22+
)
2023

2124

2225
class EntropyDetector:
@@ -29,6 +32,7 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
2932
"""Evaluate high-entropy candidate tokens in one line."""
3033
del file_path, line_number
3134
hits: list[DetectionMatch] = []
35+
line_has_context = bool(SECRET_CONTEXT_PATTERN.search(line))
3236

3337
for value, start, end in _extract_candidates(line):
3438
if _skip_candidate(value):
@@ -37,6 +41,8 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
3741
entropy = shannon_entropy(value)
3842
if entropy < self.threshold:
3943
continue
44+
if not line_has_context and (len(value) < 32 or entropy < (self.threshold + 0.6)):
45+
continue
4046

4147
confidence = min(0.92, 0.48 + (entropy - self.threshold) * 0.18)
4248
severity = Severity.MEDIUM if confidence < 0.8 else Severity.HIGH

src/leaklens/detectors/regex.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22

33
from __future__ import annotations
44

5+
import base64
6+
import json
57
import re
68
from dataclasses import dataclass
79
from pathlib import Path
810

11+
from .entropy import shannon_entropy
912
from ..models import DetectionMatch, DetectorSource, RuleSpec
1013

1114
SAFE_ALT = (
@@ -37,6 +40,7 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
3740
del line_number
3841
hits: list[DetectionMatch] = []
3942
file_name = Path(file_path).name
43+
line_lower = line.lower()
4044

4145
for compiled in self._rules:
4246
if compiled.spec.name == "dotenv_assignment" and not file_name.startswith(".env"):
@@ -47,6 +51,8 @@ def scan_line(self, file_path: str, line_number: int, line: str) -> list[Detecti
4751
continue
4852
if compiled.spec.name == "dotenv_assignment" and _is_placeholder_value(value):
4953
continue
54+
if not _passes_rule_heuristics(compiled.spec.name, value, line, line_lower):
55+
continue
5056
hits.append(
5157
DetectionMatch(
5258
finding_type=compiled.spec.secret_type,
@@ -104,3 +110,135 @@ def _is_placeholder_value(value: str) -> bool:
104110
"null",
105111
}
106112
return lowered in placeholders or lowered.startswith("example")
113+
114+
115+
def _passes_rule_heuristics(rule_name: str, value: str, line: str, line_lower: str) -> bool:
116+
normalized = value.strip()
117+
if not normalized:
118+
return False
119+
if _contains_placeholder_markers(normalized):
120+
return False
121+
122+
if rule_name == "aws_access_key":
123+
if len(normalized) != 20:
124+
return False
125+
if normalized.upper().endswith("EXAMPLE"):
126+
return False
127+
return _has_any_keyword(line_lower, {"aws", "access_key", "secret_key", "iam", "akia", "asia"})
128+
129+
if rule_name == "aws_secret_key":
130+
if len(normalized) != 40:
131+
return False
132+
return shannon_entropy(normalized) >= 3.4
133+
134+
if rule_name == "github_token":
135+
if len(normalized) < 40:
136+
return False
137+
if not _has_any_keyword(
138+
line_lower,
139+
{"github", "token", "authorization", "bearer", "ghp_", "gho_", "ghu_", "ghs_", "ghr_", "github_pat_"},
140+
):
141+
return False
142+
return shannon_entropy(_strip_prefix(normalized)) >= 3.2
143+
144+
if rule_name == "stripe_secret":
145+
if len(normalized) < 24:
146+
return False
147+
if not _has_any_keyword(line_lower, {"stripe", "sk_live_", "sk_test_", "secret", "token", "api_key"}):
148+
return False
149+
return shannon_entropy(_strip_prefix(normalized)) >= 3.1
150+
151+
if rule_name == "slack_token":
152+
if len(normalized) < 20 or normalized.count("-") < 2:
153+
return False
154+
if not _has_any_keyword(line_lower, {"slack", "xox", "token", "oauth", "bot", "webhook"}):
155+
return False
156+
return shannon_entropy(_strip_prefix(normalized)) >= 3.0
157+
158+
if rule_name == "jwt_token":
159+
if len(normalized) < 80:
160+
return False
161+
if not _has_any_keyword(line_lower, {"jwt", "bearer", "authorization", "id_token", "access_token", "token"}):
162+
return False
163+
if not _looks_like_jwt_header(normalized):
164+
return False
165+
parts = normalized.split(".")
166+
if len(parts) != 3:
167+
return False
168+
return shannon_entropy(parts[1] + parts[2]) >= 3.2
169+
170+
if rule_name == "ssh_private_key":
171+
marker = "-----BEGIN OPENSSH PRIVATE KEY-----"
172+
return _is_unquoted_marker_line(line, marker)
173+
174+
if rule_name == "rsa_private_key":
175+
marker = "-----BEGIN RSA PRIVATE KEY-----"
176+
return _is_unquoted_marker_line(line, marker)
177+
178+
return True
179+
180+
181+
def _strip_prefix(value: str) -> str:
182+
if "_" in value:
183+
return value.split("_", 1)[1]
184+
if "-" in value:
185+
return value.split("-", 1)[1]
186+
return value
187+
188+
189+
def _has_any_keyword(line_lower: str, keywords: set[str]) -> bool:
190+
return any(keyword in line_lower for keyword in keywords)
191+
192+
193+
def _contains_placeholder_markers(value: str) -> bool:
194+
lowered = value.lower()
195+
markers = {
196+
"example",
197+
"placeholder",
198+
"dummy",
199+
"changeme",
200+
"replace_me",
201+
"your_token_here",
202+
"your-key",
203+
"fake",
204+
"testtoken",
205+
"redacted",
206+
}
207+
return any(marker in lowered for marker in markers)
208+
209+
210+
def _is_unquoted_marker_line(line: str, marker: str) -> bool:
211+
stripped = line.strip()
212+
if not stripped.startswith(marker):
213+
return False
214+
if stripped != marker:
215+
return False
216+
return '"' not in stripped and "'" not in stripped
217+
218+
219+
def _looks_like_jwt_header(token: str) -> bool:
220+
parts = token.split(".")
221+
if len(parts) != 3:
222+
return False
223+
decoded = _decode_base64url(parts[0])
224+
if decoded is None:
225+
return False
226+
try:
227+
payload = json.loads(decoded)
228+
except json.JSONDecodeError:
229+
return False
230+
if not isinstance(payload, dict):
231+
return False
232+
return "alg" in payload or "typ" in payload
233+
234+
235+
def _decode_base64url(segment: str) -> str | None:
236+
padding = "=" * (-len(segment) % 4)
237+
try:
238+
raw = base64.urlsafe_b64decode(segment + padding)
239+
except (ValueError, TypeError):
240+
return None
241+
try:
242+
return raw.decode("utf-8")
243+
except UnicodeDecodeError:
244+
return None

src/leaklens/rules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
RuleSpec(
2828
name="github_token",
2929
secret_type="GitHub Token",
30-
pattern=r"\bgh[pousr]_[A-Za-z0-9]{36,255}\b",
30+
pattern=r"\b(?:gh[pousr]_[A-Za-z0-9]{36,255}|github_pat_[A-Za-z0-9_]{20,255})\b",
3131
severity=Severity.HIGH,
3232
confidence=0.95,
3333
risk="GitHub tokens can be used to access repositories, secrets, and workflows.",

0 commit comments

Comments
 (0)