@@ -244,86 +244,68 @@ def _get_features(self, query: str) -> np.ndarray:
244244 """
245245
246246 # Splitting by dots to calculate label length and max length
247+ query = query .strip ("." )
247248 label_parts = query .split ("." )
249+
250+ levels = {
251+ "fqdn" : query ,
252+ "secondleveldomain" : label_parts [- 2 ] if len (label_parts ) >= 2 else "" ,
253+ "thirdleveldomain" : "." .join (label_parts [:- 2 ]) if len (label_parts ) > 2 else "" ,
254+ }
255+
248256 label_length = len (label_parts )
249- label_max = max (len (part ) for part in label_parts )
250- label_average = len (query .strip ("." ))
257+ parts = query .split ("." )
258+ label_max = len (max (parts , key = str )) if parts else 0
259+ label_average = len (query )
260+
261+ basic_features = np .array ([label_length , label_max , label_average ], dtype = np .float64 )
251262
252- logger .debug ("Get letter frequency" )
253263 alc = "abcdefghijklmnopqrstuvwxyz"
264+ query_len = len (query )
254265 freq = np .array (
255- [query .lower ().count (i ) / len (query ) if len (query ) > 0 else 0 for i in alc ]
266+ [query .lower ().count (c ) / query_len if query_len > 0 else 0.0 for c in alc ],
267+ dtype = np .float64
256268 )
257269
258270 logger .debug ("Get full, alpha, special, and numeric count." )
259-
260271 def calculate_counts (level : str ) -> np .ndarray :
261- if len (level ) == 0 :
262- return np .array ([0 , 0 , 0 , 0 ])
263-
264- full_count = len (level )
265- alpha_count = sum (c .isalpha () for c in level ) / full_count
266- numeric_count = sum (c .isdigit () for c in level ) / full_count
267- special_count = (
268- sum (not c .isalnum () and not c .isspace () for c in level ) / full_count
269- )
272+ if not level :
273+ return np .array ([0.0 , 0.0 , 0.0 , 0.0 ], dtype = np .float64 )
270274
271- return np .array ([full_count , alpha_count , numeric_count , special_count ])
275+ full_count = len (level ) / len (level )
276+ alpha_ratio = sum (c .isalpha () for c in level ) / len (level )
277+ numeric_ratio = sum (c .isdigit () for c in level ) / len (level )
278+ special_ratio = sum (not c .isalnum () and not c .isspace () for c in level ) / len (level )
272279
273- levels = {
274- "fqdn" : query ,
275- "thirdleveldomain" : label_parts [0 ] if len (label_parts ) > 2 else "" ,
276- "secondleveldomain" : label_parts [1 ] if len (label_parts ) > 1 else "" ,
277- }
278- counts = {
279- level : calculate_counts (level_value )
280- for level , level_value in levels .items ()
281- }
280+ return np .array ([full_count , alpha_ratio , numeric_ratio , special_ratio ], dtype = np .float64 )
282281
283- logger .debug (
284- "Get standard deviation, median, variance, and mean for full, alpha, special, and numeric count."
285- )
286- stats = {}
287- for level , count_array in counts .items ():
288- stats [f"{ level } _std" ] = np .std (count_array )
289- stats [f"{ level } _var" ] = np .var (count_array )
290- stats [f"{ level } _median" ] = np .median (count_array )
291- stats [f"{ level } _mean" ] = np .mean (count_array )
282+ fqdn_counts = calculate_counts (levels ["fqdn" ])
283+ third_counts = calculate_counts (levels ["thirdleveldomain" ])
284+ second_counts = calculate_counts (levels ["secondleveldomain" ])
292285
293- logger . debug ( "Start entropy calculation" )
286+ level_features = np . hstack ([ third_counts , second_counts , fqdn_counts ] )
294287
295288 def calculate_entropy (s : str ) -> float :
296289 if len (s ) == 0 :
297- return 0
298- probabilities = [float (s .count (c )) / len (s ) for c in dict .fromkeys (list (s ))]
299- entropy = - sum (p * math .log (p , 2 ) for p in probabilities )
300- return entropy
301-
302- entropy = {level : calculate_entropy (value ) for level , value in levels .items ()}
303-
304- logger .debug ("Finished entropy calculation" )
305-
306- # Final feature aggregation as a NumPy array
307- basic_features = np .array ([label_length , label_max , label_average ])
308-
309- # Flatten counts and stats for each level into arrays
310- level_features = np .hstack ([counts [level ] for level in levels .keys ()])
311-
312- # Entropy features
313- entropy_features = np .array ([entropy [level ] for level in levels .keys ()])
314-
315- # Concatenate all features into a single numpy array
316- all_features = np .concatenate (
317- [
318- basic_features ,
319- freq ,
320- # freq_features,
321- level_features ,
322- # stats_features,
323- entropy_features ,
324- ]
325- )
326-
290+ return 0.0
291+ probs = [s .count (c ) / len (s ) for c in dict .fromkeys (s )]
292+ return - sum (p * math .log (p , 2 ) for p in probs )
293+ logger .debug ("Start entropy calculation" )
294+ entropy_features = np .array ([
295+ calculate_entropy (levels ["fqdn" ]),
296+ calculate_entropy (levels ["thirdleveldomain" ]),
297+ calculate_entropy (levels ["secondleveldomain" ]),
298+ ], dtype = np .float64 )
299+
300+ logger .debug ("Entropy features calculated" )
301+
302+ all_features = np .concatenate ([
303+ basic_features ,
304+ freq ,
305+ level_features ,
306+ entropy_features
307+ ])
308+
327309 logger .debug ("Finished data transformation" )
328310
329311 return all_features .reshape (1 , - 1 )
@@ -338,8 +320,9 @@ def detect(self) -> None: # pragma: no cover
338320 logger .info ("Start detecting malicious requests." )
339321 for message in self .messages :
340322 # TODO predict all messages
323+ # TODO use scaler: self.scaler.transform(self._get_features(message["domain_name"]))
341324 y_pred = self .model .predict_proba (
342- self .scaler . transform ( self . _get_features (message ["domain_name" ]) )
325+ self ._get_features (message ["domain_name" ])
343326 )
344327 logger .info (f"Prediction: { y_pred } " )
345328 if np .argmax (y_pred , axis = 1 ) == 1 and y_pred [0 ][1 ] > THRESHOLD :
0 commit comments