11import re
2+ from typing import Optional
23
3- from credsweeper .common .constants import DEFAULT_PATTERN_LEN
4+ from credsweeper .common .constants import DEFAULT_PATTERN_LEN , MAX_LINE_LENGTH
45from credsweeper .config .config import Config
56from credsweeper .credentials .line_data import LineData
67from credsweeper .file_handler .analysis_target import AnalysisTarget
78from credsweeper .filters .filter import Filter
89
10+ MAX_PATTERN_LENGTH = int (MAX_LINE_LENGTH ).bit_length () # bit length of MAX_LINE_LENGTH (maximal value length might be 8000); largest index of the per-bit-length pattern tables
11+
912
1013class ValuePatternCheck (Filter ):
1114 """Check if candidate value contain specific pattern.
@@ -22,36 +25,55 @@ class ValuePatternCheck(Filter):
2225 Default pattern LEN is 4
2326 """
2427
25- def __init__ (self , config : Config = None , pattern_len : int = DEFAULT_PATTERN_LEN ):
28+ default_patterns = list (
29+ re .compile (fr"(\S)\1{{{ str (x - 1 ) if DEFAULT_PATTERN_LEN < x else '3' } ,}}" )
30+ for x in range (MAX_PATTERN_LENGTH + 1 ))
31+ various_pattern_lengths = list (x if DEFAULT_PATTERN_LEN < x else DEFAULT_PATTERN_LEN
32+ for x in range (MAX_PATTERN_LENGTH + 1 ))
33+
34+ def __init__ (self , config : Config = None , pattern_len : Optional [int ] = None ):
2635 """Create ValuePatternCheck with a specific pattern_len to check.
2736
2837 Args:
2938 config: pattern len to use during check. DEFAULT_PATTERN_LEN by default
39+ pattern_len: size of constant pattern length for any value size or None for dynamic pattern size
3040
3141 """
32- self .pattern_len = pattern_len
33- # use non whitespace symbol pattern
34- self .pattern = re .compile (fr"(\S)\1{{{ str (self .pattern_len - 1 )} ,}}" )
42+ if pattern_len is None :
43+ self .pattern_len = - 1
44+ # pattern length depends on value length
45+ self .pattern_lengths = ValuePatternCheck .various_pattern_lengths
46+ self .patterns = ValuePatternCheck .default_patterns
47+ elif isinstance (pattern_len , int ) and DEFAULT_PATTERN_LEN <= pattern_len :
48+ self .pattern_len = pattern_len
49+ # constant pattern for any value length
50+ pattern = re .compile (fr"(\S)\1{{{ str (pattern_len - 1 )} ,}}" )
51+ self .pattern_lengths = list (pattern_len for _ in range (MAX_PATTERN_LENGTH + 1 ))
52+ self .patterns = list (pattern for _ in range (MAX_PATTERN_LENGTH + 1 ))
53+ else :
54+ raise ValueError (f"Wrong type of pattern length { type (pattern_len )} = { repr (pattern_len )} " )
3555
36- def equal_pattern_check (self , value : str ) -> bool :
56+ def equal_pattern_check (self , value : str , bit_length : int ) -> bool :
3757 """Check if candidate value contain 4 and more same chars or numbers sequences.
3858
3959 Args:
4060 value: string variable, credential candidate value
61+ bit_length: speedup for len(value).bit_length()
4162
4263 Return:
4364 True if contain and False if not
4465
4566 """
46- if self .pattern .findall (value ):
67+ if self .patterns [ bit_length ] .findall (value ):
4768 return True
4869 return False
4970
50- def ascending_pattern_check (self , value : str ) -> bool :
71+ def ascending_pattern_check (self , value : str , bit_length : int ) -> bool :
5172 """Check if candidate value contain 4 and more ascending chars or numbers sequences.
5273
5374 Arg:
5475 value: credential candidate value
76+ bit_length: speedup for len(value).bit_length()
5577
5678 Return:
5779 True if contain and False if not
@@ -64,15 +86,16 @@ def ascending_pattern_check(self, value: str) -> bool:
6486 else :
6587 count = 1
6688 continue
67- if count == self .pattern_len :
89+ if count == self .pattern_lengths [ bit_length ] :
6890 return True
6991 return False
7092
71- def descending_pattern_check (self , value : str ) -> bool :
93+ def descending_pattern_check (self , value : str , bit_length : int ) -> bool :
7294 """Check if candidate value contain 4 and more descending chars or numbers sequences.
7395
7496 Arg:
7597 value: string variable, credential candidate value
98+ bit_length: speedup for len(value).bit_length()
7699
77100 Return:
78101 boolean variable. True if contain and False if not
@@ -85,59 +108,44 @@ def descending_pattern_check(self, value: str) -> bool:
85108 else :
86109 count = 1
87110 continue
88- if count == self .pattern_len :
111+ if count == self .pattern_lengths [ bit_length ] :
89112 return True
90113 return False
91114
92- def check_val (self , value : str ) -> bool :
115+ def check_val (self , value : str , bit_length : int ) -> bool :
93116 """Cumulative value check.
94117
95118 Arg:
96119 value: string variable, credential candidate value
120+ bit_length: speedup for len(value).bit_length()
97121
98122 Return:
99123 boolean variable. True if contain and False if not
100124
101125 """
102- if self .equal_pattern_check (value ):
126+ if self .equal_pattern_check (value , bit_length ):
103127 return True
104- if self .ascending_pattern_check (value ):
128+ if self .ascending_pattern_check (value , bit_length ):
105129 return True
106- if self .descending_pattern_check (value ):
130+ if self .descending_pattern_check (value , bit_length ):
107131 return True
108132 return False
109133
110- def duple_pattern_check (self , value : str ) -> bool :
134+ def duple_pattern_check (self , value : str , bit_length : int ) -> bool :
111135 """Check if candidate value is a duplet value with possible patterns.
112136
113137 Arg:
114138 value: string variable, credential candidate value
139+ bit_length: speedup for len(value).bit_length()
115140
116141 Return:
117142 boolean variable. True if contain and False if not
118143
119144 """
120- # 001122334455... case
121- pair_duple = True
122- # 0102030405... case
123- even_duple = True
124- even_prev = value [0 ]
125145 even_value = value [0 ::2 ]
126- # 1020304050... case
127- odd_duple = True
128- odd_prev = value [1 ]
129146 odd_value = value [1 ::2 ]
130- for even_i , odd_i in zip (even_value , odd_value ):
131- pair_duple &= even_i == odd_i
132- even_duple &= even_i == even_prev
133- odd_duple &= odd_i == odd_prev
134- if not pair_duple and not even_duple and not odd_duple :
135- break
136- else :
137- if pair_duple or odd_duple :
138- return self .check_val (even_value )
139- if even_duple :
140- return self .check_val (odd_value )
147+ if self .check_val (even_value , bit_length ) and self .check_val (odd_value , bit_length ):
148+ return True
141149 return False
142150
143151 def run (self , line_data : LineData , target : AnalysisTarget ) -> bool :
@@ -151,13 +159,22 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
151159 boolean variable. True, if need to filter candidate and False if left
152160
153161 """
154- if len (line_data .value ) < self .pattern_len :
162+ value_length = len (line_data .value )
163+ bit_length = max (DEFAULT_PATTERN_LEN , value_length .bit_length ())
164+
165+ if MAX_PATTERN_LENGTH < bit_length :
166+ # huge values may contain anything
167+ return False
168+
169+ if 0 <= value_length < self .pattern_len or value_length < self .pattern_lengths [bit_length ]:
170+ # too short value
155171 return True
156172
157- if self .check_val (line_data .value ):
173+ if self .check_val (line_data .value , bit_length ):
158174 return True
159175
160- if 2 * self .pattern_len <= len (line_data .value ) and self .duple_pattern_check (line_data .value ):
176+ if 2 * self .pattern_lengths [bit_length ] <= value_length \
177+ and self .duple_pattern_check (line_data .value , bit_length ):
161178 return True
162179
163180 return False
0 commit comments