Skip to content

Commit b3af35e

Browse files
committed
softreset
1 parent e4b5ed1 commit b3af35e

31 files changed

+18137
-941
lines changed

.ci/benchmark.txt

Lines changed: 45 additions & 45 deletions
Large diffs are not rendered by default.

.github/workflows/benchmark.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
3232
with:
3333
repository: Samsung/CredData
34-
ref: d425c1b7600407ca5a82f2379fdc8627d194fb39
34+
ref: ceb8cf86e0db95e1f1159cd7b480e80d8188a0bf
3535

3636
- name: Markup hashing
3737
run: |
@@ -87,7 +87,7 @@ jobs:
8787
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
8888
with:
8989
repository: Samsung/CredData
90-
ref: d425c1b7600407ca5a82f2379fdc8627d194fb39
90+
ref: ceb8cf86e0db95e1f1159cd7b480e80d8188a0bf
9191

9292
- name: Markup hashing
9393
run: |
@@ -190,7 +190,7 @@ jobs:
190190
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
191191
with:
192192
repository: Samsung/CredData
193-
ref: d425c1b7600407ca5a82f2379fdc8627d194fb39
193+
ref: ceb8cf86e0db95e1f1159cd7b480e80d8188a0bf
194194

195195
- name: Markup hashing
196196
run: |
@@ -378,7 +378,7 @@ jobs:
378378
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - 2024.10.23
379379
with:
380380
repository: Samsung/CredData
381-
ref: d425c1b7600407ca5a82f2379fdc8627d194fb39
381+
ref: ceb8cf86e0db95e1f1159cd7b480e80d8188a0bf
382382

383383
- name: Markup hashing
384384
run: |

credsweeper/common/keyword_pattern.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,22 @@
44
class KeywordPattern:
55
"""Pattern set of keyword types"""
66
directive = r"(?P<directive>(?:(?:[#%]define|%global)(?:\s|\\t)|\bset))?"
7-
key_left = r"(?:\\[nrt]|%[0-9a-f]{2}|\s)*" \
7+
key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
88
r"(?P<variable>(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
99
r"(?P<keyword>"
1010
# there will be inserted a keyword
1111
key_right = r")" \
1212
r"[^%:='\"`<>({?!&;\n]*" \
1313
r")" \
14-
r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \
14+
r"(&(quot|apos);|(\\\\*u00|%)[0-9a-f]{2}|[`'\"])*" \
1515
r")" # <variable>
1616
separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
1717
r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|&gt;|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
1818
r"|(?(directive)(\\t|\s|\((?!\))){1,80}|%3d))" \
1919
r"(\s|\\{1,8}[tnr])*"
2020
# might be curly, square or parenthesis with words before
2121
wrap = r"(?P<wrap>(" \
22-
r"(new(\s|\\{1,8}[tnr]|byte|char|string|\[\]){1,8})?" \
22+
r"((\s|\\{1,8}[tnr]|new|byte|char|string|\[\]){1,8})?" \
2323
r"(?P<get>([_a-z][0-9a-z_.\[\]]*\.)get|(os\.)?getenv)?" \
2424
r"([0-9a-z_.]|::|-(>|&gt;))*" \
2525
r"\s*" \

credsweeper/filters/group/group.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ def get_keyword_base_filters(config: Config) -> List[Filter]:
5858
ValueTokenCheck(),
5959
]
6060
if not config.doc:
61-
filters.extend([ValuePatternCheck(pattern_len=config.pattern_len), ValueNotAllowedPatternCheck()])
61+
filters.extend([ValuePatternCheck(config), ValueNotAllowedPatternCheck()])
6262
return filters
6363

6464
@staticmethod
6565
def get_pattern_base_filters(config: Config) -> List[Filter]:
6666
"""return base filters for pattern"""
6767
return [ #
6868
LineSpecificKeyCheck(), #
69-
ValuePatternCheck(pattern_len=config.pattern_len), #
69+
ValuePatternCheck(config), #
7070
]

credsweeper/filters/value_array_dictionary_check.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class ValueArrayDictionaryCheck(Filter):
1414
`token = {'root'}` would be kept
1515
"""
1616

17-
PATTERN = re.compile(r"\[('|\")?[^,]+('|\")?\]")
17+
PATTERN = re.compile(r"\[['\"]?[^,]+['\"]?]")
1818

1919
def __init__(self, config: Config = None) -> None:
2020
pass
@@ -32,11 +32,12 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
3232
"""
3333
if line_data.is_well_quoted_value:
3434
return False
35+
# not well quoted value
3536
if line_data.wrap and "byte" in line_data.wrap.lower():
3637
return False
3738
if self.PATTERN.search(line_data.value):
3839
return True
39-
if line_data.wrap and not line_data.is_well_quoted_value and ('[' in line_data.wrap or '(' in line_data.wrap):
40+
if line_data.wrap and (line_data.wrap.endswith('[') or line_data.wrap.endswith('(')):
4041
return True
4142

4243
return False
credsweeper/filters/value_pattern_check.py

Lines changed: 55 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import re
2+
from typing import Optional
23

3-
from credsweeper.common.constants import DEFAULT_PATTERN_LEN
4+
from credsweeper.common.constants import DEFAULT_PATTERN_LEN, MAX_LINE_LENGTH
45
from credsweeper.config.config import Config
56
from credsweeper.credentials.line_data import LineData
67
from credsweeper.file_handler.analysis_target import AnalysisTarget
78
from credsweeper.filters.filter import Filter
89

10+
MAX_PATTERN_LENGTH = int(MAX_LINE_LENGTH).bit_length() # maximal value length might be 8000
11+
912

1013
class ValuePatternCheck(Filter):
1114
"""Check if candidate value contain specific pattern.
@@ -22,36 +25,55 @@ class ValuePatternCheck(Filter):
2225
Default pattern LEN is 4
2326
"""
2427

25-
def __init__(self, config: Config = None, pattern_len: int = DEFAULT_PATTERN_LEN):
28+
default_patterns = list(
29+
re.compile(fr"(\S)\1{{{str(x - 1) if DEFAULT_PATTERN_LEN < x else '3'},}}")
30+
for x in range(MAX_PATTERN_LENGTH + 1))
31+
various_pattern_lengths = list(x if DEFAULT_PATTERN_LEN < x else DEFAULT_PATTERN_LEN
32+
for x in range(MAX_PATTERN_LENGTH + 1))
33+
34+
def __init__(self, config: Config = None, pattern_len: Optional[int] = None):
2635
"""Create ValuePatternCheck with a specific pattern_len to check.
2736
2837
Args:
2938
config: pattern len to use during check. DEFAULT_PATTERN_LEN by default
39+
pattern_len: size of constant pattern length for any value size or None for dynamic pattern size
3040
3141
"""
32-
self.pattern_len = pattern_len
33-
# use non whitespace symbol pattern
34-
self.pattern = re.compile(fr"(\S)\1{{{str(self.pattern_len - 1)},}}")
42+
if pattern_len is None:
43+
self.pattern_len = -1
44+
# pattern length depends on value length
45+
self.pattern_lengths = ValuePatternCheck.various_pattern_lengths
46+
self.patterns = ValuePatternCheck.default_patterns
47+
elif isinstance(pattern_len, int) and DEFAULT_PATTERN_LEN <= pattern_len:
48+
self.pattern_len = pattern_len
49+
# constant pattern for any value length
50+
pattern = re.compile(fr"(\S)\1{{{str(pattern_len - 1)},}}")
51+
self.pattern_lengths = list(pattern_len for _ in range(MAX_PATTERN_LENGTH + 1))
52+
self.patterns = list(pattern for _ in range(MAX_PATTERN_LENGTH + 1))
53+
else:
54+
raise ValueError(f"Wrong type of pattern length {type(pattern_len)} = {repr(pattern_len)}")
3555

36-
def equal_pattern_check(self, value: str) -> bool:
56+
def equal_pattern_check(self, value: str, bit_length: int) -> bool:
3757
"""Check if candidate value contain 4 and more same chars or numbers sequences.
3858
3959
Args:
4060
value: string variable, credential candidate value
61+
bit_length: speedup for len(value).bit_length()
4162
4263
Return:
4364
True if contain and False if not
4465
4566
"""
46-
if self.pattern.findall(value):
67+
if self.patterns[bit_length].findall(value):
4768
return True
4869
return False
4970

50-
def ascending_pattern_check(self, value: str) -> bool:
71+
def ascending_pattern_check(self, value: str, bit_length: int) -> bool:
5172
"""Check if candidate value contain 4 and more ascending chars or numbers sequences.
5273
5374
Arg:
5475
value: credential candidate value
76+
bit_length: speedup for len(value).bit_length()
5577
5678
Return:
5779
True if contain and False if not
@@ -64,15 +86,16 @@ def ascending_pattern_check(self, value: str) -> bool:
6486
else:
6587
count = 1
6688
continue
67-
if count == self.pattern_len:
89+
if count == self.pattern_lengths[bit_length]:
6890
return True
6991
return False
7092

71-
def descending_pattern_check(self, value: str) -> bool:
93+
def descending_pattern_check(self, value: str, bit_length: int) -> bool:
7294
"""Check if candidate value contain 4 and more descending chars or numbers sequences.
7395
7496
Arg:
7597
value: string variable, credential candidate value
98+
bit_length: speedup for len(value).bit_length()
7699
77100
Return:
78101
boolean variable. True if contain and False if not
@@ -85,59 +108,44 @@ def descending_pattern_check(self, value: str) -> bool:
85108
else:
86109
count = 1
87110
continue
88-
if count == self.pattern_len:
111+
if count == self.pattern_lengths[bit_length]:
89112
return True
90113
return False
91114

92-
def check_val(self, value: str) -> bool:
115+
def check_val(self, value: str, bit_length: int) -> bool:
93116
"""Cumulative value check.
94117
95118
Arg:
96119
value: string variable, credential candidate value
120+
bit_length: speedup for len(value).bit_length()
97121
98122
Return:
99123
boolean variable. True if contain and False if not
100124
101125
"""
102-
if self.equal_pattern_check(value):
126+
if self.equal_pattern_check(value, bit_length):
103127
return True
104-
if self.ascending_pattern_check(value):
128+
if self.ascending_pattern_check(value, bit_length):
105129
return True
106-
if self.descending_pattern_check(value):
130+
if self.descending_pattern_check(value, bit_length):
107131
return True
108132
return False
109133

110-
def duple_pattern_check(self, value: str) -> bool:
134+
def duple_pattern_check(self, value: str, bit_length: int) -> bool:
111135
"""Check if candidate value is a duplet value with possible patterns.
112136
113137
Arg:
114138
value: string variable, credential candidate value
139+
bit_length: speedup for len(value).bit_length()
115140
116141
Return:
117142
boolean variable. True if contain and False if not
118143
119144
"""
120-
# 001122334455... case
121-
pair_duple = True
122-
# 0102030405... case
123-
even_duple = True
124-
even_prev = value[0]
125145
even_value = value[0::2]
126-
# 1020304050... case
127-
odd_duple = True
128-
odd_prev = value[1]
129146
odd_value = value[1::2]
130-
for even_i, odd_i in zip(even_value, odd_value):
131-
pair_duple &= even_i == odd_i
132-
even_duple &= even_i == even_prev
133-
odd_duple &= odd_i == odd_prev
134-
if not pair_duple and not even_duple and not odd_duple:
135-
break
136-
else:
137-
if pair_duple or odd_duple:
138-
return self.check_val(even_value)
139-
if even_duple:
140-
return self.check_val(odd_value)
147+
if self.check_val(even_value, bit_length) and self.check_val(odd_value, bit_length):
148+
return True
141149
return False
142150

143151
def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
@@ -151,13 +159,22 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
151159
boolean variable. True, if need to filter candidate and False if left
152160
153161
"""
154-
if len(line_data.value) < self.pattern_len:
162+
value_length = len(line_data.value)
163+
bit_length = max(DEFAULT_PATTERN_LEN, value_length.bit_length())
164+
165+
if MAX_PATTERN_LENGTH < bit_length:
166+
# huge values may contain anything
167+
return False
168+
169+
if 0 <= value_length < self.pattern_len or value_length < self.pattern_lengths[bit_length]:
170+
# too short value
155171
return True
156172

157-
if self.check_val(line_data.value):
173+
if self.check_val(line_data.value, bit_length):
158174
return True
159175

160-
if 2 * self.pattern_len <= len(line_data.value) and self.duple_pattern_check(line_data.value):
176+
if 2 * self.pattern_lengths[bit_length] <= value_length \
177+
and self.duple_pattern_check(line_data.value, bit_length):
161178
return True
162179

163180
return False

credsweeper/filters/value_string_type_check.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ class ValueStringTypeCheck(Filter):
2323
False otherwise
2424
"""
2525

26-
MULTIBYTE_PATTERN = re.compile(r"(\s*(0x)?[0-9a-f]{1,3}\s*,){8,80}", flags=re.IGNORECASE)
26+
MULTIBYTE_PATTERN = re.compile(r"((0x)?[0-9a-f]{1,16}[UL]*)(\s*,\s*((0x)?[0-9a-f]{1,16}[UL]*)){3}",
27+
flags=re.IGNORECASE)
2728

2829
def __init__(self, config: Config) -> None:
2930
self.check_for_literals = config.check_for_literals
@@ -42,7 +43,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
4243
if not self.check_for_literals or line_data.url_part:
4344
return False
4445

45-
if ValueStringTypeCheck.MULTIBYTE_PATTERN.match(line_data.value):
46+
if ValueStringTypeCheck.MULTIBYTE_PATTERN.search(line_data.value):
4647
return False
4748

4849
if line_data.is_source_file_with_quotes() \

credsweeper/filters/value_token_check.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class ValueTokenCheck(Filter):
1717
1818
"""
1919

20-
SPLIT_PATTERN = r"(?<!,) (?!,)|;|\)|\(|{|}|<|>|\[|\]|`"
20+
SPLIT_PATTERN = re.compile(r"(?<!\W) (?!\W)|[;(){}<>[\]`]")
2121

2222
def __init__(self, config: Config = None) -> None:
2323
pass

0 commit comments

Comments
 (0)