@@ -20,26 +20,23 @@ class EntropyEvaluation(Feature):
2020
2121 """
2222
23- def __init__ (self ) -> None :
24- """Class initializer"""
25- super ().__init__ ()
26- # Max size of ML analyzed value is ML_HUNK but value may be bigger
27- self .hunk_size = 4 * ML_HUNK
28- self .log2_cache : Dict [int , float ] = {x : math .log2 (x ) for x in range (4 , self .hunk_size + 1 )}
29- self .char_sets : List [Set [str ]] = [set (x .value ) for x in Chars ]
23+ # Max size of ML analyzed value is ML_HUNK but value may be bigger
24+ HUNK_SIZE = 4 * ML_HUNK
25+ LOG2_CACHE : Dict [int , float ] = {x : math .log2 (x ) for x in range (4 , 4 * ML_HUNK + 1 )}
26+ CHAR_SET : List [Set [str ]] = [set (x .value ) for x in Chars ]
27+ RESULT_SIZE = 3 + len (Chars )
3028
3129 def extract (self , candidate : Candidate ) -> np .ndarray :
3230 """Returns real entropy and possible sets of characters"""
3331 # only head of value will be analyzed
34- result : np .ndarray = np .zeros (shape = 3 + len ( self . char_sets ) , dtype = np .float32 )
35- value = candidate .line_data_list [0 ].value [:self . hunk_size ]
32+ result : np .ndarray = np .zeros (shape = EntropyEvaluation . RESULT_SIZE , dtype = np .float32 )
33+ value = candidate .line_data_list [0 ].value [:EntropyEvaluation . HUNK_SIZE ]
3634 size = len (value )
3735 uniq , counts = np .unique (list (value ), return_counts = True )
3836 if MIN_DATA_LEN <= size :
3937 # evaluate the entropy for a value of at least 4
4038 probabilities = counts / size
41- hartley_entropy = self .log2_cache .get (size , - 1.0 )
42- assert hartley_entropy , str (candidate )
39+ hartley_entropy = EntropyEvaluation .LOG2_CACHE .get (size , - 1.0 )
4340
4441 # renyi_entropy alpha=0.5
4542 sum_prob_05 = np .sum (probabilities ** 0.5 )
@@ -59,7 +56,7 @@ def extract(self, candidate: Candidate) -> np.ndarray:
5956 # check charset for non-zero value
6057 # use the new variable to deal with mypy
6158 uniq_set = set (uniq )
62- for n , i in enumerate (self . char_sets , start = 3 ):
59+ for n , i in enumerate (EntropyEvaluation . CHAR_SET , start = 3 ):
6360 if not uniq_set .difference (i ):
6461 result [n ] = 1.0
6562
0 commit comments