forked from WASIK-S/phishing_detection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstep2_feature_engineering.py
More file actions
58 lines (51 loc) · 1.79 KB
/
step2_feature_engineering.py
File metadata and controls
58 lines (51 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import re
from urllib.parse import urlparse
def extract_features(url):
    """Extract lexical phishing-indicator features from a single URL.

    Parameters
    ----------
    url : str
        The URL to analyze. Non-string or otherwise unparsable input
        falls through to the all-zero default feature dict.

    Returns
    -------
    dict
        Mapping of feature name to integer value (all zeros on failure).
    """
    # Single source of truth for the feature schema: the fallback dict is
    # derived from this tuple, so the success and failure branches can
    # never drift out of sync.
    feature_names = (
        "url_length", "domain_length", "num_digits", "num_dots",
        "num_hyphens", "has_https", "num_subdirs", "num_query_params",
        "contains_login", "contains_secure", "contains_account",
        "contains_update", "contains_verify",
    )
    try:
        parsed = urlparse(url)
        lowered = url.lower()  # computed once, reused by all keyword checks
        return {
            "url_length": len(url),
            "domain_length": len(parsed.netloc),
            "num_digits": sum(c.isdigit() for c in url),
            "num_dots": url.count('.'),
            "num_hyphens": url.count('-'),
            # Prefix heuristic kept from the original (not a strict
            # scheme == "https" comparison).
            "has_https": 1 if url.startswith("https") else 0,
            "num_subdirs": parsed.path.count('/'),
            "num_query_params": url.count('='),
            "contains_login": 1 if "login" in lowered else 0,
            "contains_secure": 1 if "secure" in lowered else 0,
            "contains_account": 1 if "account" in lowered else 0,
            "contains_update": 1 if "update" in lowered else 0,
            "contains_verify": 1 if "verify" in lowered else 0,
        }
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # SystemExit/KeyboardInterrupt). Unparsable rows get zeros so the
        # downstream DataFrame keeps a uniform schema.
        return dict.fromkeys(feature_names, 0)
# --- Step 2 pipeline: raw URLs -> numeric feature table ---

# Raw dataset produced by step 1 (expects "url" and "label" columns).
raw = pd.read_csv("data/global_phish_dataset.csv")

# One feature dict per URL, expanded so each key becomes its own column.
feature_table = raw["url"].apply(extract_features).apply(pd.Series)

# Carry the ground-truth label alongside the engineered features.
feature_table["label"] = raw["label"]

# Persist the feature matrix for the model-training step.
feature_table.to_csv("processed_features.csv", index=False)

print("✅ Step 2 Completed: processed_features.csv created successfully!")
print("Total rows:", len(feature_table))
print(feature_table.head())