# extract_features.py
"""
URL Feature Extraction Module
This script defines functions for extracting features from HTML content of URLs and saving the results to a CSV file.
The module includes the 'extract_features' function, which takes a URL and its type as input, performs a web request,
parses the HTML content, and extracts features such as title and the number of links. Additionally, the 'process_urls'
function utilizes 'extract_features' to process a list of URLs with their corresponding types, creating a Pandas DataFrame
from the extracted features and saving it to a specified output CSV file.
Usage:
from extract_features import extract_features, process_urls
features_list = extract_features('http://example.com', 'legitimate')
process_urls([(url1, type1), (url2, type2)], 'output_features.csv')
Functions:
- extract_features(url, website_type): Extracts features from the HTML content of a URL.
- process_urls(urls_and_types, output_file): Processes a list of URLs with types and saves the features to a CSV file.
Note: The 'extract_features' function can be modified to include additional features as needed.
Author: Ahmaad Ansari
Date: March 10, 2024
"""
import os
import logging
import requests
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import re
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Set up a logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Configure the logger to print to the console
console = logging.StreamHandler()
# Set up colors for logging levels
RED = "\033[1;31m"
GREEN = "\033[1;32m"
RESET = "\033[0m"
# Create a class to colorize log messages
class ColoredFormatter(logging.Formatter):
    def format(self, record):
        if record.levelno == logging.ERROR:
            return RED + super().format(record) + RESET
        elif record.levelno == logging.INFO:
            return GREEN + super().format(record) + RESET
        else:
            return super().format(record)
console.setFormatter(ColoredFormatter('%(levelname)s: %(message)s'))
# Add the console handler to the logger
logger.addHandler(console)
def extract_features(url, website_type):
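    """Fetch a URL and, on success, return a dict of extracted HTML features.

    Returns None if the request fails for any reason (network error,
    timeout, or non-2xx status code).
    """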
    result = None  # Will hold the extracted features; stays None on failure
    try:
        # Ensure the URL has a scheme (default to https://)
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Set a user agent to mimic a web browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

        # Make a request to the URL with headers
        response = requests.get(url, headers=headers, timeout=5, verify=False)

        # Raise for non-2xx status codes
        response.raise_for_status()

        logger.info(f"Successfully fetched {url}")
    except requests.exceptions.RequestException as e:
        # Log error if the request fails
        logger.error(f"Error fetching {url}: {e}")
    else:
        # Request succeeded; parse the HTML and extract features
        result = extract_features_from_html(response.text, url, website_type)
    return result
def extract_features_from_html(html_content, url, website_type):
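    """Parse HTML content and return a dict of binary, quantitative,
    content-length, heuristic, and URL-structure features."""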
    # Use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Binary features
    has_login_form = len(soup.select('form[action*="login"]')) > 0
    has_https = url.startswith('https://')
    has_iframe = len(soup.find_all('iframe')) > 0
    phishing_words = ['login', 'password', 'account', 'verify', 'security', 'authenticate', 'update', 'confirm', 'identity', 'validation', 'billing', 'unusual', 'suspicious', 'urgent', 'information', 'recovery', 'suspend', 'fraud', 'alert', 'compromise']
    has_phishing_words = any(word in html_content.lower() for word in phishing_words)
    has_title = bool(soup.title)
    keywords = ['official', 'authorized', 'genuine', 'secure', 'trusted', 'verified', 'legitimate']
    has_keywords = any(word in html_content.lower() for word in keywords)
    has_external_links = any('href' in link.attrs and link['href'].startswith(('http://', 'https://')) for link in soup.find_all('a'))
    has_popular_script_libraries = any(lib in html_content.lower() for lib in ['jquery', 'angular', 'react', 'vue'])

    # Quantitative features
    num_links = len(soup.find_all('a'))
    num_images = len(soup.find_all('img'))
    num_scripts = len(soup.find_all('script'))
    num_styles = len(soup.find_all('link', rel='stylesheet'))
    num_forms = len(soup.find_all('form'))
    num_input_tags = len(soup.find_all('input'))

    # Content-length features
    content_length = len(html_content)
    words = re.findall(r'\b\w+\b', html_content)
    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0

    # Heuristic features
    avg_link_text_length = sum(len(link.text) for link in soup.find_all('a')) / num_links if num_links > 0 else 0

    # URL structure features
    num_subdomains = len(url.split('.')) - 2  # rough count: dot-separated labels minus the domain and TLD
    num_special_chars = sum(1 for char in url if char in ['-', '_', '.', '~', ':', '/', '?', '#', '[', ']', '@', '!', '$', '&', "'", '(', ')', '*', '+', ',', ';', '='])  # count special characters in the URL

    return {
        'url': url,
        'type': website_type,
        'has_login_form': has_login_form,
        'has_https': has_https,
        'has_iframe': has_iframe,
        'has_phishing_words': has_phishing_words,
        'num_links': num_links,
        'num_images': num_images,
        'num_scripts': num_scripts,
        'num_styles': num_styles,
        'num_forms': num_forms,
        'num_input_tags': num_input_tags,
        'content_length': content_length,
        'avg_word_length': avg_word_length,
        'has_title': has_title,
        'has_keywords': has_keywords,
        'has_external_links': has_external_links,
        'avg_link_text_length': avg_link_text_length,
        'has_popular_script_libraries': has_popular_script_libraries,
        'num_subdomains': num_subdomains,
        'num_special_chars': num_special_chars
    }
def process_urls(urls_and_types, output_file):
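    """Extract features for each (url, type) pair and write them to a CSV.

    If output_file already exists, its rows are preserved and the new
    features are appended.
    """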
    # Collect feature dicts for all URLs
    features_list = []

    # If the output CSV already exists, load its rows so new results are appended
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
        features_list += existing_data.to_dict('records')

    for url, website_type in urls_and_types:
        # Extract features for each URL
        features = extract_features(url, website_type)

        # Only keep URLs whose features were successfully extracted
        if features is not None:
            features_list.append(features)

    # Convert the list of dictionaries to a Pandas DataFrame and save it to CSV
    df = pd.DataFrame(features_list)
    df.to_csv(output_file, index=False)
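

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): the URLs, labels,
    # and output path below are placeholders; substitute your own dataset.
    sample_urls = [
        ('http://example.com', 'legitimate'),
        ('http://example.net', 'phishing'),
    ]
    process_urls(sample_urls, 'output_features.csv')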