Skip to content

Commit e30c3a7

Browse files
authored
Use YAML safe load and remove asserts (#808)
* Use YAML safe load and remove asserts
* PNG scanner fix: iTXt
* Version bumped to 1.14.6
* Applied ML for DOC rules
* Added Amazon Bedrock API Key rule
1 parent 5f6278f commit e30c3a7

File tree

21 files changed

+1022
-403
lines changed

21 files changed

+1022
-403
lines changed

.github/workflows/check.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ jobs:
9292
run: |
9393
banner="$(python -m credsweeper --banner | head -1)"
9494
echo "banner = '${banner}'"
95-
if [ "CredSweeper 1.14.5 crc32:da87b2ca" != "${banner}" ]; then
95+
if [ "CredSweeper 1.14.6 crc32:765e27c6" != "${banner}" ]; then
9696
echo "Update the check for '${banner}'"
9797
exit 1
9898
fi

credsweeper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424
"__version__"
2525
]
2626

27-
__version__ = "1.14.5"
27+
__version__ = "1.14.6"

credsweeper/deep_scanner/png_scanner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def yield_png_chunks(data: bytes) -> Generator[Tuple[int, str, bytes], None, Non
5757
else:
5858
raise ValueError(f"Unsupported compression {repr(itxt_data[:2])}")
5959
lang_tag, itxt_data = itxt_data[2:].split(b'\0', 1)
60-
trans_key, itxt_data = itxt_data[2:].split(b'\0', 1)
60+
trans_key, itxt_data = itxt_data.split(b'\0', 1)
6161
yield (offset, f"PNG_ITXT_{'1' if compression else '0'}"
6262
f":{keyword.decode(encoding=UTF_8)}"
6363
f":{lang_tag.decode(encoding=UTF_8)}"

credsweeper/file_handler/data_content_provider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def represent_as_structure(self) -> Optional[bool]:
125125
# # # YAML - almost always recognized
126126
try:
127127
if ':' in self.text and (2 < self.text.count('\n') or 2 < self.text.count('\r')):
128-
self.structure = yaml.load(self.text, Loader=yaml.FullLoader)
128+
self.structure = yaml.safe_load(self.text)
129129
logger.debug("CONVERTED from yaml")
130130
else:
131131
logger.debug("Data do not contain colon mark - weak YAML")

credsweeper/ml_model/features/entropy_evaluation.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,23 @@ class EntropyEvaluation(Feature):
2020
2121
"""
2222

23-
def __init__(self) -> None:
24-
"""Class initializer"""
25-
super().__init__()
26-
# Max size of ML analyzed value is ML_HUNK but value may be bigger
27-
self.hunk_size = 4 * ML_HUNK
28-
self.log2_cache: Dict[int, float] = {x: math.log2(x) for x in range(4, self.hunk_size + 1)}
29-
self.char_sets: List[Set[str]] = [set(x.value) for x in Chars]
23+
# Max size of ML analyzed value is ML_HUNK but value may be bigger
24+
HUNK_SIZE = 4 * ML_HUNK
25+
LOG2_CACHE: Dict[int, float] = {x: math.log2(x) for x in range(4, 4 * ML_HUNK + 1)}
26+
CHAR_SET: List[Set[str]] = [set(x.value) for x in Chars]
27+
RESULT_SIZE = 3 + len(Chars)
3028

3129
def extract(self, candidate: Candidate) -> np.ndarray:
3230
"""Returns real entropy and possible sets of characters"""
3331
# only head of value will be analyzed
34-
result: np.ndarray = np.zeros(shape=3 + len(self.char_sets), dtype=np.float32)
35-
value = candidate.line_data_list[0].value[:self.hunk_size]
32+
result: np.ndarray = np.zeros(shape=EntropyEvaluation.RESULT_SIZE, dtype=np.float32)
33+
value = candidate.line_data_list[0].value[:EntropyEvaluation.HUNK_SIZE]
3634
size = len(value)
3735
uniq, counts = np.unique(list(value), return_counts=True)
3836
if MIN_DATA_LEN <= size:
3937
# evaluate the entropy for a value of at least 4
4038
probabilities = counts / size
41-
hartley_entropy = self.log2_cache.get(size, -1.0)
42-
assert hartley_entropy, str(candidate)
39+
hartley_entropy = EntropyEvaluation.LOG2_CACHE.get(size, -1.0)
4340

4441
# renyi_entropy alpha=0.5
4542
sum_prob_05 = np.sum(probabilities**0.5)
@@ -59,7 +56,7 @@ def extract(self, candidate: Candidate) -> np.ndarray:
5956
# check charset for non-zero value
6057
# use the new variable to deal with mypy
6158
uniq_set = set(uniq)
62-
for n, i in enumerate(self.char_sets, start=3):
59+
for n, i in enumerate(EntropyEvaluation.CHAR_SET, start=3):
6360
if not uniq_set.difference(i):
6461
result[n] = 1.0
6562

credsweeper/rules/config.yaml

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
confidence: moderate
6969
type: pattern
7070
values:
71-
- (?P<variable>[\"'`]?(?i:token|secret|key|키|암호화?|토큰)[\"'`]?)((\s)*[=:](\s)*)(?P<quote>[\"'`(])?(?P<value>(?-i:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/_+=~!@#$%^&*;:?-])){8,80}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
71+
- (?P<variable>[\"'`]?(?i:token|secret|key|키|암호화?|토큰)[\"'`]?)((\s)*(?P<separator>설정은|:=|:(?!:)|=(>|&gt;|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=|%3[Dd])(\s)*)(?P<quote>[\"'`(])?(?P<value>(?-i:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/_+=~!@#$%^&*;:?-])){8,80}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
7272
filter_type:
7373
- ValueAllowlistCheck
7474
- ValuePatternCheck(4)
@@ -84,13 +84,14 @@
8484
- 토큰
8585
target:
8686
- doc
87+
use_ml: true
8788

8889
- name: PASSWD_PAIR
8990
severity: medium
9091
confidence: moderate
9192
type: pattern
9293
values:
93-
- (?P<variable>[\"'`]?(?i:(?<!id[ :/])pa[as]swo?r?ds?|pwd?|p/w|비밀번호|비번|패스워드|암호)[\"'`]?)((\s)*[=:](\s)*)(?P<quote>[\"'`(])?(?P<value>(?-i:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/_+=~!@#$%^&*;:?-])){8,64}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
94+
- (?P<variable>[\"'`]?(?i:(?<!id[ :/])pa[as]swo?r?ds?|pwd?|p/w|비밀번호|비번|패스워드|암호)[\"'`]?)((\s)*(?P<separator>설정은|:=|:(?!:)|=(>|&gt;|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=|%3[Dd])(\s)*)(?P<quote>[\"'`(])?(?P<value>(?-i:(?P<a>[A-Z])|(?P<b>[a-z])|(?P<c>[0-9/_+=~!@#$%^&*;:?-])){8,64}(?(a)(?(b)(?(c)(\S|$)|(?!x)x)|(?!x)x)|(?!x)x))(?(quote)[)\"'`])
9495
filter_type:
9596
- ValueAllowlistCheck
9697
- ValuePatternCheck(4)
@@ -112,6 +113,7 @@
112113
- 암호
113114
target:
114115
- doc
116+
use_ml: true
115117

116118
- name: IP_ID_PASSWORD_TRIPLE
117119
severity: medium
@@ -128,6 +130,7 @@
128130
- "."
129131
target:
130132
- doc
133+
use_ml: true
131134

132135
- name: ID_PAIR_PASSWD_PAIR
133136
severity: medium
@@ -151,6 +154,7 @@
151154
- 암호
152155
target:
153156
- doc
157+
use_ml: true
154158

155159
- name: ID_PASSWD_PAIR
156160
severity: medium
@@ -173,6 +177,7 @@
173177
- 암호
174178
target:
175179
- doc
180+
use_ml: true
176181

177182
- name: UUID
178183
severity: info
@@ -204,14 +209,30 @@
204209
- code
205210
- doc
206211

212+
- name: Amazon Bedrock API Key
213+
severity: high
214+
confidence: moderate
215+
type: pattern
216+
values:
217+
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P<value>(ABSK|bedrock-api-key-)[0-9A-Za-z/+]{28,800})(?![0-9A-Za-z/+])
218+
filter_type: GeneralPattern
219+
required_substrings:
220+
- ABSK
221+
- bedrock-api-key-
222+
min_line_len: 44
223+
target:
224+
- code
225+
- doc
226+
207227
- name: AWS Client ID
208228
severity: high
209229
confidence: moderate
210230
type: pattern
211231
values:
212-
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P<value>(ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![0-9A-Za-z_+-])
232+
- (?:^|/|[^\\0-9A-Za-z+_-]|\\[0abfnrtv]|(?:%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu][0-9A-Fa-f]{4}|\x1B\[[0-9;]{0,80}m)(?P<value>(A3T[0-9A-Z]|ABIA|ACCA|AGPA|AIDA|AIPA|AKIA|ANPA|ANVA|AROA|APKA|ASCA|ASIA)[0-9A-Z]{16,17})(?![0-9A-Za-z_+-])
213233
filter_type: GeneralPattern
214234
required_substrings:
235+
- A3T
215236
- ABIA
216237
- ACCA
217238
- AGPA
@@ -1000,7 +1021,7 @@
10001021
confidence: strong
10011022
type: pattern
10021023
values:
1003-
- (?P<value>(_gitlab_session=|GR1348941|gl(agent|soat|ffct|p[at]t|oas|cbt|imt|[dfr]t)-)[0-9A-Za-z_-]{20,64}(\.[0-9A-Za-z_-]{2,16}){0,2})(?![0-9A-Za-z_-])
1024+
- (?P<value>(_gitlab_session=|GR1348941|gl(agent|soat|ffct|p[at]t|oas|cbt|imt|rtr|[dfrw]t)-)[0-9A-Za-z_-]{20,64}(\.[0-9A-Za-z_-]{2,16}){0,2})(?![0-9A-Za-z_-])
10041025
filter_type:
10051026
- ValuePatternCheck
10061027
min_line_len: 25
@@ -1018,6 +1039,8 @@
10181039
- gldt-
10191040
- glft-
10201041
- glrt-
1042+
- glrtr-
1043+
- glwt-
10211044
target:
10221045
- code
10231046
- doc

credsweeper/scanner/scan_type/multi_pattern.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candida
3737
Empty list (False) - otherwise.
3838
3939
"""
40-
assert rule.rule_type == RuleType.MULTI, \
41-
"Rules provided to MultiPattern.run should have pattern_type equal to MULTI_PATTERN"
40+
if RuleType.MULTI != rule.rule_type:
41+
raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
42+
f"should have pattern_type equal to `{RuleType.MULTI.value}`")
4243

4344
candidates = cls._get_candidates(config, rule, target)
4445

credsweeper/scanner/scan_type/pem_key_pattern.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candida
2929
and filters defined in rule do not remove current line. Empty list - otherwise
3030
3131
"""
32-
assert rule.rule_type == RuleType.PEM_KEY, \
33-
"Rules provided to PemKeyPattern.run should have pattern_type equal to PEM_KEY_PATTERN"
32+
if RuleType.PEM_KEY != rule.rule_type:
33+
raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
34+
f"should have pattern_type equal to `{RuleType.PEM_KEY.value}`")
3435
if candidates := cls._get_candidates(config, rule, target):
3536
candidate = candidates[0]
3637
if pem_lines := PemKeyDetector.detect_pem_key(config, target):

credsweeper/scanner/scan_type/single_pattern.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import List
22

3+
from credsweeper.common.constants import RuleType
34
from credsweeper.config.config import Config
45
from credsweeper.credentials.candidate import Candidate
56
from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -25,4 +26,8 @@ def run(cls, config: Config, rule: Rule, target: AnalysisTarget) -> List[Candida
2526
2627
"""
2728

29+
if RuleType.PATTERN != rule.rule_type and RuleType.KEYWORD != rule.rule_type:
30+
raise ValueError(f"Rule `{rule}` provided to `{cls.__name__}`.run "
31+
f"should have pattern_type equal to `{RuleType.PATTERN.value}`")
32+
2833
return cls._get_candidates(config, rule, target)

credsweeper/utils/util.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def extract_element_data(element: Any, attr: str) -> str:
352352

353353
@staticmethod
354354
def json_load(file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> Any:
355-
"""Load dictionary from json file"""
355+
"""Load dictionary from JSON file"""
356356
try:
357357
with open(file_path, "r", encoding=encoding) as f:
358358
return json.load(f)
@@ -362,7 +362,7 @@ def json_load(file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> Any:
362362

363363
@staticmethod
364364
def json_dump(obj: Any, file_path: Union[str, Path], encoding=DEFAULT_ENCODING, indent=4) -> None:
365-
"""Write dictionary to json file"""
365+
"""Write dictionary to JSON file"""
366366
try:
367367
with open(file_path, "w", encoding=encoding) as f:
368368
json.dump(obj, f, indent=indent)
@@ -371,17 +371,17 @@ def json_dump(obj: Any, file_path: Union[str, Path], encoding=DEFAULT_ENCODING,
371371

372372
@staticmethod
373373
def yaml_load(file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> Any:
374-
"""Load dictionary from yaml file"""
374+
"""Load dictionary from YAML file"""
375375
try:
376376
with open(file_path, "r", encoding=encoding) as f:
377-
return yaml.load(f, Loader=yaml.FullLoader)
377+
return yaml.safe_load(f)
378378
except Exception as exc:
379379
logger.error(f"Failed to read {file_path} {exc}")
380380
return None
381381

382382
@staticmethod
383383
def yaml_dump(obj: Any, file_path: Union[str, Path], encoding=DEFAULT_ENCODING) -> None:
384-
"""Write dictionary to yaml file"""
384+
"""Write dictionary to YAML file"""
385385
try:
386386
with open(file_path, "w", encoding=encoding) as f:
387387
yaml.dump(obj, f)

0 commit comments

Comments
 (0)