Skip to content

Commit 238c30d

Browse files
feat: Expand tool to analyze all websites
This commit expands the Universal Security Analyzer so that it can analyze any website URL, not just specific social media platforms. Key changes:
- Refactored the main menu in `main.py` to offer a clear choice between analyzing a social media platform and analyzing a general website URL.
- Added a new function, `analyze_website_url`, to handle the new workflow.
- Implemented `analyze_url_content` in `scam_detector.py` to fetch, parse, and analyze the text content of a given URL for scam indicators.
- Updated `heuristics.py` to be more generic:
  - Added a `general_web` category to `LEGITIMATE_DOMAINS` with a list of common websites.
  - `SUSPICIOUS_URL_PATTERNS` is now generated dynamically from the keywords in `LEGITIMATE_DOMAINS`, making the phishing detection more maintainable and scalable.
- Removed the old "browser" platform in favor of the more powerful and direct website analysis feature.
1 parent edf432f commit 238c30d

File tree

8 files changed

+184
-79
lines changed

8 files changed

+184
-79
lines changed
0 Bytes
Binary file not shown.
Binary file not shown.
1.66 KB
Binary file not shown.
2 KB
Binary file not shown.
1.24 KB
Binary file not shown.

social_media_analyzer/heuristics.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,22 @@
4343
"bankofamerica.com", "chase.com", "wellsfargo.com", "citibank.com",
4444
"hsbc.com", "barclays.com", "deutsche-bank.com", "santander.com"
4545
],
46-
"general": ["google.com"]
46+
"general": ["google.com"],
47+
"general_web": [
48+
"wikipedia.org", "yahoo.com", "live.com", "microsoft.com",
49+
"apple.com", "netflix.com", "twitch.tv", "ebay.com",
50+
"craigslist.org", "imdb.com", "nytimes.com", "theguardian.com",
51+
"bbc.com", "cnn.com", "espn.com", "walmart.com", "target.com",
52+
"bestbuy.com", "homedepot.com", "lowes.com", "costco.com",
53+
"stackoverflow.com", "github.com", "gitlab.com", "wordpress.org",
54+
"wordpress.com", "blogger.com", "tumblr.com", "medium.com",
55+
"quora.com", "flickr.com", "adobe.com", "soundcloud.com",
56+
"spotify.com", "dropbox.com", "box.com", "slack.com",
57+
"salesforce.com", "oracle.com", "sap.com", "ibm.com", "dell.com",
58+
"hp.com", "intel.com", "amd.com", "nvidia.com", "booking.com",
59+
"airbnb.com", "expedia.com", "tripadvisor.com", "fedex.com",
60+
"ups.com", "usps.com", "dhl.com"
61+
]
4762
}
4863

4964

@@ -138,27 +153,48 @@
138153

139154
# Suspicious URL Patterns
140155
# These patterns aim to catch URLs that impersonate legitimate domains.
141-
SUSPICIOUS_URL_PATTERNS = [
142-
# Impersonation using subdomains or hyphens for social media and general platforms
143-
r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat|telegram|twitter|pinterest|linkedin|line|discord|teams|zoom|amazon|alibaba|youtube|skype|vk|reddit|viber|signal|badoo|binance|sharechat)\.com\.[a-z0-9\-]+\.[a-z]+",
144-
r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat|telegram|twitter|pinterest|linkedin|line|discord|teams|zoom|amazon|alibaba|youtube|skype|vk|reddit|viber|signal|badoo|binance|sharechat)-[a-z0-9\-]+\.[a-z]+",
145-
146-
# Impersonation for fintech and banks
147-
r"https?://(?:[a-z0-9\-]+\.)*(?:paypal|stripe|payoneer|bankofamerica|chase|wellsfargo|citibank|hsbc|barclays)\.com\.[a-z0-9\-]+\.[a-z]+",
148-
r"https?://(?:[a-z0-9\-]+\.)*(?:paypal|stripe|payoneer|bankofamerica|chase|wellsfargo|citibank|hsbc|barclays)-[a-z0-9\-]+\.[a-z]+",
149-
150-
# Common URL shorteners
151-
r"https?://bit\.ly",
152-
r"https?://goo\.gl",
153-
r"https?://t\.co",
154-
# IP Address URLs
155-
r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
156-
# Generic suspicious keywords in the domain
157-
r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)",
158-
# Very long subdomains or many hyphens
159-
r"https?://(?:[a-z0-9\-]+\.){4,}",
160-
r"https?://[^/]*\-.*\-.*\-.*[a-z]+",
161-
]
156+
def generate_suspicious_url_patterns(legitimate_domains):
    """
    Generates regex patterns to detect URLs impersonating legitimate domains.

    Args:
        legitimate_domains: Mapping of platform name to an iterable of domain
            strings, e.g. ``{"facebook": ["facebook.com", "fb.com"]}``.

    Returns:
        A list of regex pattern strings. When at least one keyword can be
        derived, the first two entries are the keyword-based impersonation
        patterns; the remaining entries are keyword-independent heuristics
        (URL shorteners, raw IP hosts, suspicious TLDs, deep subdomains,
        many hyphens). The output is deterministic for a given input.
    """
    # These mapping keys are never used as keywords themselves (they read as
    # category labels rather than service names); their domains still are.
    non_keyword_platforms = {"general", "general_web", "banks"}
    # Remove very generic keywords that might cause false positives
    excluded_keywords = {"google", "apple", "microsoft"}

    keywords = set()
    for platform, domains in legitimate_domains.items():
        if platform not in non_keyword_platforms:
            keywords.add(platform)
        for domain in domains:
            # Add the core part of the domain, e.g., "facebook" from "facebook.com"
            keyword = domain.split('.')[0]
            if keyword != "com" and len(keyword) > 2:
                keywords.add(keyword)

    keywords -= excluded_keywords
    # An empty alternative would let the keyword group match nothing at all.
    keywords.discard("")

    patterns = []
    if keywords:
        # Longest keywords first so a longer brand name is tried before any
        # shorter keyword that is its prefix. The alphabetical tie-break makes
        # the generated text independent of set iteration order, which for
        # strings varies between interpreter runs.
        ordered = sorted(keywords, key=lambda k: (-len(k), k))
        keyword_group = "|".join(re.escape(k) for k in ordered)

        # Impersonation using subdomains or hyphens, e.g.,
        # "facebook.security-alert.com" or "facebook-login.com".
        # NOTE(review): the first pattern matches any keyword label followed
        # by two further labels, so hosts shaped like "<keyword>.co.uk" also
        # match - confirm this is acceptable for the callers.
        patterns.append(
            r"https?://(?:[a-z0-9\-]+\.)*(?:" + keyword_group + r")\.(?:[a-z0-9\-]+)\.(?:[a-z]+)"
        )
        patterns.append(
            r"https?://(?:[a-z0-9\-]+\.)*(?:" + keyword_group + r")-(?:[a-z0-9\-]+)\.(?:[a-z]+)"
        )

    patterns.extend([
        # Common URL shorteners
        r"https?://bit\.ly",
        r"https?://goo\.gl",
        r"https?://t\.co",
        # IP Address URLs
        r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
        # Generic suspicious keywords in the domain combined with suspicious TLDs
        r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)",
        # Very long subdomains (potential phishing)
        r"https?://(?:[a-z0-9\-]+\.){4,}",
        # Multiple hyphens in the domain (potential phishing)
        r"https?://[^/]*\-.*\-.*\-.*[a-z]+",
    ])
    return patterns
196+
197+
# Evaluated once, when this module is imported. Keeps the SUSPICIOUS_URL_PATTERNS
# name that the previous hand-written list used, now derived from LEGITIMATE_DOMAINS.
SUSPICIOUS_URL_PATTERNS = generate_suspicious_url_patterns(LEGITIMATE_DOMAINS)
162198

163199

164200
# --- Scoring Weights ---

social_media_analyzer/main.py

Lines changed: 107 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,51 @@
11
from . import fake_profile_detector
22
from . import scam_detector
33

4-
def main():
5-
"""Main function to run the social media analyzer."""
6-
print("--- Social Media Analyzer ---")
7-
print("This tool helps you analyze social media profiles and messages for potential scams.")
4+
def analyze_website_url():
    """Analyzes a website URL for potential scams.

    Prompts for a URL, adds a default ``http://`` scheme when none is given,
    and runs two checks in order:

    1. ``scam_detector.is_url_suspicious`` on the URL string itself; if it
       flags the URL, the reason is printed and the content is not fetched.
    2. ``scam_detector.analyze_url_content`` on the page, printing either the
       error, a "nothing found" message, or the score and indicators.

    Returns:
        None. All results are printed to stdout.
    """
    url_to_check = input("Please enter the full URL you want to analyze: ").strip()
    if not url_to_check:
        print("No URL entered.")
        return

    # Ensure the URL has a scheme. URL schemes are case-insensitive, so the
    # comparison is done in lower case; otherwise "HTTPS://example.com" would
    # be turned into "http://HTTPS://example.com".
    if not url_to_check.lower().startswith(('http://', 'https://')):
        url_to_check = 'http://' + url_to_check

    print("\n--- Analyzing URL ---")
    is_susp, reason = scam_detector.is_url_suspicious(url_to_check, platform="general_web")
    if is_susp:
        print(f"\n[!] The URL '{url_to_check}' is flagged as IMMEDIATELY SUSPICIOUS.")
        print(f"Reason: {reason}")
        # We can stop here as the URL itself is a major red flag
        return

    print(f"\n[+] The URL '{url_to_check}' does not match common suspicious patterns.")
    print("Now analyzing the website's content...")

    # Analyze the content of the website
    content_result = scam_detector.analyze_url_content(url_to_check)

    print("\n--- Website Content Analysis Results ---")
    if "error" in content_result:
        print(f"Could not analyze website content: {content_result['error']}")
        return

    indicators = content_result.get("indicators_found")
    if not indicators:
        print("No specific scam indicators were found in the website content.")
        return

    print(f"Score: {content_result['score']} (Higher is more suspicious)")
    print("Indicators Found:")
    for indicator in indicators:
        print(f"- {indicator}")
39+
40+
def analyze_social_media():
41+
"""Handles the analysis of social media platforms."""
42+
platforms = sorted([
43+
"facebook", "instagram", "whatsapp", "tiktok", "tinder", "snapchat",
44+
"wechat", "telegram", "twitter", "pinterest", "linkedin", "line",
45+
"discord", "teams", "zoom", "amazon", "alibaba", "youtube", "skype",
46+
"vk", "reddit", "email", "viber", "signal", "badoo", "binance",
47+
"sharechat", "messenger", "qzone", "qq", "vimeo", "musical.ly"
48+
])
1049

1150
while True:
1251
print("\nSelect the social media platform you want to analyze:")
@@ -23,61 +62,72 @@ def main():
2362
except ValueError:
2463
print("Invalid input. Please enter a number.")
2564

26-
if platform == "browser":
27-
url_to_check = input("Please enter the URL you want to analyze: ").strip()
28-
if url_to_check:
29-
is_susp, reason = scam_detector.is_url_suspicious(url_to_check)
30-
print("\n--- URL Analysis Results ---")
31-
if is_susp:
32-
print(f"The URL '{url_to_check}' is SUSPICIOUS.")
33-
print(f"Reason: {reason}")
34-
else:
35-
print(f"The URL '{url_to_check}' does not seem suspicious.")
36-
print(f"Details: {reason}")
37-
else:
38-
print("No URL entered.")
39-
else:
40-
while True:
41-
print(f"\nWhat do you want to do for {platform.capitalize()}?")
42-
print("1. Analyze a profile for signs of being fake.")
43-
print("2. Analyze a profile for identity usurpation.")
44-
print("3. Analyze a message for phishing or scam attempts.")
65+
while True:
66+
print(f"\nWhat do you want to do for {platform.capitalize()}?")
67+
print("1. Analyze a profile for signs of being fake.")
68+
print("2. Analyze a profile for identity usurpation.")
69+
print("3. Analyze a message for phishing or scam attempts.")
4570

46-
try:
47-
analysis_choice = int(input("Enter your choice (1-3): "))
48-
if analysis_choice == 1:
49-
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze: ").strip()
50-
if profile_url:
51-
fake_profile_detector.analyze_profile_based_on_user_input(profile_url, platform)
52-
else:
53-
print("No profile URL entered.")
54-
break
55-
elif analysis_choice == 2:
56-
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze for impersonation: ").strip()
57-
if profile_url:
58-
fake_profile_detector.analyze_identity_usurpation(profile_url, platform)
59-
else:
60-
print("No profile URL entered.")
61-
break
62-
elif analysis_choice == 3:
63-
message = input("Paste the message you want to analyze: ").strip()
64-
if message:
65-
result = scam_detector.analyze_text_for_scams(message, platform)
66-
print("\n--- Scam Analysis Results ---")
67-
print(f"Score: {result['score']} (Higher is more suspicious)")
68-
print("Indicators Found:")
69-
if result['indicators_found']:
70-
for indicator in result['indicators_found']:
71-
print(f"- {indicator}")
72-
else:
73-
print("No specific scam indicators were found.")
71+
try:
72+
analysis_choice = int(input("Enter your choice (1-3): "))
73+
if analysis_choice == 1:
74+
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze: ").strip()
75+
if profile_url:
76+
fake_profile_detector.analyze_profile_based_on_user_input(profile_url, platform)
77+
else:
78+
print("No profile URL entered.")
79+
break
80+
elif analysis_choice == 2:
81+
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze for impersonation: ").strip()
82+
if profile_url:
83+
fake_profile_detector.analyze_identity_usurpation(profile_url, platform)
84+
else:
85+
print("No profile URL entered.")
86+
break
87+
elif analysis_choice == 3:
88+
message = input("Paste the message you want to analyze: ").strip()
89+
if message:
90+
result = scam_detector.analyze_text_for_scams(message, platform)
91+
print("\n--- Scam Analysis Results ---")
92+
print(f"Score: {result['score']} (Higher is more suspicious)")
93+
print("Indicators Found:")
94+
if result['indicators_found']:
95+
for indicator in result['indicators_found']:
96+
print(f"- {indicator}")
7497
else:
75-
print("No message entered.")
76-
break
98+
print("No specific scam indicators were found.")
7799
else:
78-
print("Invalid choice. Please try again.")
79-
except ValueError:
80-
print("Invalid input. Please enter a number.")
100+
print("No message entered.")
101+
break
102+
else:
103+
print("Invalid choice. Please try again.")
104+
except ValueError:
105+
print("Invalid input. Please enter a number.")
106+
107+
def main():
    """Main function to run the security analyzer.

    Shows the top-level menu in a loop and dispatches to
    ``analyze_social_media`` or ``analyze_website_url`` until the user picks
    "Exit" or standard input is closed.
    """
    print("--- Universal Security Analyzer ---")
    print("This tool helps you analyze social media, messages, and websites for potential scams.")

    while True:
        print("\n--- Main Menu ---")
        print("1. Analyze a Social Media Platform")
        print("2. Analyze a Website URL")
        print("3. Exit")

        # Guard only the parsing of the menu choice. If the sub-menu calls
        # were inside this try, a ValueError raised somewhere within them
        # would be misreported as a non-numeric menu entry.
        try:
            choice = int(input("Enter your choice (1-3): "))
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue
        except EOFError:
            # stdin is closed, so no further choice can ever be read;
            # leave the loop instead of crashing with a traceback.
            print("\nExiting. Stay safe!")
            break

        if choice == 1:
            analyze_social_media()
        elif choice == 2:
            analyze_website_url()
        elif choice == 3:
            print("Exiting. Stay safe!")
            break
        else:
            print("Invalid choice. Please try again.")
81131

82132
if __name__ == '__main__':
83133
main()

social_media_analyzer/scam_detector.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import urllib.request
23
from urllib.parse import urlparse
34
from .heuristics import (
45
URGENCY_KEYWORDS,
@@ -130,3 +131,21 @@ def analyze_text_for_scams(text_content, platform=None):
130131
"urls_analyzed": urls_analyzed_details
131132
}
132133

134+
def _html_to_text(html_content):
    """Reduce an HTML document to rough plain text using regexes only."""
    # Drop <script>/<style> elements together with their bodies: that content
    # is code or CSS, not visible page text, and should not be scored as prose.
    without_code = re.sub(
        r'(?is)<(script|style)\b[^>]*>.*?</\1\s*>', ' ', html_content
    )
    # Simple regex to strip HTML tags, not perfect but avoids new dependencies.
    # Tags become a space so words in adjacent elements ("<p>a</p><p>b</p>")
    # do not run together into a single token.
    return re.sub(r'<[^>]+>', ' ', without_code)


def analyze_url_content(url, max_bytes=2 * 1024 * 1024):
    """
    Fetches the content of a URL and analyzes it for scams.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to download.
        max_bytes: Upper bound on how many bytes of the response body are
            read, so an oversized response cannot exhaust memory.

    Returns:
        The result of ``analyze_text_for_scams`` for the page text, or a dict
        of the form ``{"error": "<message>"}`` when the URL is rejected or
        the download fails.
    """
    # urlopen() also understands file: and ftp: URLs. Restrict this helper to
    # web schemes so user-supplied input cannot be used to read local files.
    try:
        scheme = urlparse(url).scheme.lower()
    except (ValueError, TypeError, AttributeError) as e:
        return {"error": f"An error occurred: {e}"}
    if scheme not in ('http', 'https'):
        return {"error": f"Unsupported URL scheme: '{scheme}' (only http and https are allowed)"}

    try:
        # Add a user-agent to avoid being blocked by some websites
        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as response:
            if response.status != 200:
                return {"error": f"Failed to fetch URL: HTTP status code {response.status}"}
            raw_body = response.read(max_bytes)
            # Prefer the charset declared in the Content-Type header.
            charset = response.headers.get_content_charset() or 'utf-8'
    except Exception as e:
        # Best-effort boundary: any network, protocol or TLS failure is
        # reported to the caller as an error result instead of raising.
        return {"error": f"An error occurred: {e}"}

    try:
        html_content = raw_body.decode(charset, errors='ignore')
    except LookupError:
        # The server announced a codec Python does not know; fall back.
        html_content = raw_body.decode('utf-8', errors='ignore')

    # Analysis runs outside the fetch guard so a bug in the analyzer is not
    # disguised as a download error.
    text_content = _html_to_text(html_content)
    return analyze_text_for_scams(text_content, platform="general_web")

0 commit comments

Comments
 (0)