Skip to content

Commit e2cafcb

Browse files
Fix detector feature calculation
1 parent 1a0b625 commit e2cafcb

File tree

1 file changed

+48
-65
lines changed

1 file changed

+48
-65
lines changed

src/detector/detector.py

Lines changed: 48 additions & 65 deletions
Original file line numberDiff line numberDiff line change
def _get_features(self, query: str) -> np.ndarray:
    """Transform a domain-name query string into a single feature row.

    Feature layout (in order): basic label statistics, per-letter relative
    frequency over 'a'-'z', character-class ratios per domain level
    (third, second, fqdn), and Shannon entropy per domain level
    (fqdn, third, second).

    Args:
        query: Domain name (FQDN), possibly with leading/trailing dots.

    Returns:
        np.ndarray of shape (1, F), ready for the scaler/model pipeline.
    """
    # Normalize: drop leading/trailing dots, then split into labels.
    query = query.strip(".")
    label_parts = query.split(".")

    # Domain levels used for per-level features.
    # NOTE(review): assumes the second-level domain is the label just left
    # of the TLD and "third level" is everything further left — confirm
    # this matches the feature definition used at training time.
    levels = {
        "fqdn": query,
        "secondleveldomain": label_parts[-2] if len(label_parts) >= 2 else "",
        "thirdleveldomain": ".".join(label_parts[:-2]) if len(label_parts) > 2 else "",
    }

    # Number of labels in the query.
    label_length = len(label_parts)
    # Length of the longest label. Bug fix: max(parts, key=str) picked the
    # lexicographically greatest label, not the longest one.
    label_max = max((len(part) for part in label_parts), default=0)
    # Total length of the stripped query (kept under its historical name).
    label_average = len(query)

    basic_features = np.array(
        [label_length, label_max, label_average], dtype=np.float64
    )

    # Relative frequency of each lowercase ASCII letter in the query.
    alc = "abcdefghijklmnopqrstuvwxyz"
    query_lower = query.lower()
    query_len = len(query)
    freq = np.array(
        [query_lower.count(c) / query_len if query_len > 0 else 0.0 for c in alc],
        dtype=np.float64,
    )

    logger.debug("Get full, alpha, special, and numeric count.")

    def calculate_counts(level: str) -> np.ndarray:
        """Return [length, alpha_ratio, numeric_ratio, special_ratio] for one level."""
        if not level:
            return np.zeros(4, dtype=np.float64)
        n = len(level)
        # Bug fix: len(level) / len(level) was always 1.0; the intended
        # feature (as in the original implementation) is the raw length.
        full_count = float(n)
        alpha_ratio = sum(c.isalpha() for c in level) / n
        numeric_ratio = sum(c.isdigit() for c in level) / n
        special_ratio = sum(not c.isalnum() and not c.isspace() for c in level) / n
        return np.array(
            [full_count, alpha_ratio, numeric_ratio, special_ratio],
            dtype=np.float64,
        )

    fqdn_counts = calculate_counts(levels["fqdn"])
    third_counts = calculate_counts(levels["thirdleveldomain"])
    second_counts = calculate_counts(levels["secondleveldomain"])

    # Order matters: the downstream model expects third, second, fqdn.
    level_features = np.hstack([third_counts, second_counts, fqdn_counts])

    def calculate_entropy(s: str) -> float:
        """Shannon entropy (base 2) of the character distribution of s."""
        if not s:
            return 0.0
        # dict.fromkeys preserves first-seen order and deduplicates chars.
        probs = [s.count(c) / len(s) for c in dict.fromkeys(s)]
        return -sum(p * math.log(p, 2) for p in probs)

    logger.debug("Start entropy calculation")
    entropy_features = np.array(
        [
            calculate_entropy(levels["fqdn"]),
            calculate_entropy(levels["thirdleveldomain"]),
            calculate_entropy(levels["secondleveldomain"]),
        ],
        dtype=np.float64,
    )
    logger.debug("Entropy features calculated")

    all_features = np.concatenate(
        [basic_features, freq, level_features, entropy_features]
    )

    logger.debug("Finished data transformation")

    # Reshape to a single sample row for scaler.transform / model.predict.
    return all_features.reshape(1, -1)
@@ -338,8 +320,9 @@ def detect(self) -> None: # pragma: no cover
338320
logger.info("Start detecting malicious requests.")
339321
for message in self.messages:
340322
# TODO predict all messages
323+
# TODO use scalar: self.scaler.transform(self._get_features(message["domain_name"]))
341324
y_pred = self.model.predict_proba(
342-
self.scaler.transform(self._get_features(message["domain_name"]))
325+
self._get_features(message["domain_name"])
343326
)
344327
logger.info(f"Prediction: {y_pred}")
345328
if np.argmax(y_pred, axis=1) == 1 and y_pred[0][1] > THRESHOLD:

0 commit comments

Comments
 (0)