Skip to content

Commit d6e7478

Browse files
authored
Merge pull request #16 from GYFX35/feature/expand-to-all-websites
feat: Expand tool to analyze all websites
2 parents edf432f + 238c30d commit d6e7478

File tree

8 files changed

+184
-79
lines changed

8 files changed

+184
-79
lines changed
0 Bytes
Binary file not shown.
Binary file not shown.
1.66 KB
Binary file not shown.
2 KB
Binary file not shown.
1.24 KB
Binary file not shown.

social_media_analyzer/heuristics.py

Lines changed: 58 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,22 @@
4343
"bankofamerica.com", "chase.com", "wellsfargo.com", "citibank.com",
4444
"hsbc.com", "barclays.com", "deutsche-bank.com", "santander.com"
4545
],
46-
"general": ["google.com"]
46+
"general": ["google.com"],
47+
"general_web": [
48+
"wikipedia.org", "yahoo.com", "live.com", "microsoft.com",
49+
"apple.com", "netflix.com", "twitch.tv", "ebay.com",
50+
"craigslist.org", "imdb.com", "nytimes.com", "theguardian.com",
51+
"bbc.com", "cnn.com", "espn.com", "walmart.com", "target.com",
52+
"bestbuy.com", "homedepot.com", "lowes.com", "costco.com",
53+
"stackoverflow.com", "github.com", "gitlab.com", "wordpress.org",
54+
"wordpress.com", "blogger.com", "tumblr.com", "medium.com",
55+
"quora.com", "flickr.com", "adobe.com", "soundcloud.com",
56+
"spotify.com", "dropbox.com", "box.com", "slack.com",
57+
"salesforce.com", "oracle.com", "sap.com", "ibm.com", "dell.com",
58+
"hp.com", "intel.com", "amd.com", "nvidia.com", "booking.com",
59+
"airbnb.com", "expedia.com", "tripadvisor.com", "fedex.com",
60+
"ups.com", "usps.com", "dhl.com"
61+
]
4762
}
4863

4964

@@ -138,27 +153,48 @@
138153

139154
# Suspicious URL Patterns
140155
# These patterns aim to catch URLs that impersonate legitimate domains.
141-
SUSPICIOUS_URL_PATTERNS = [
142-
# Impersonation using subdomains or hyphens for social media and general platforms
143-
r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat|telegram|twitter|pinterest|linkedin|line|discord|teams|zoom|amazon|alibaba|youtube|skype|vk|reddit|viber|signal|badoo|binance|sharechat)\.com\.[a-z0-9\-]+\.[a-z]+",
144-
r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat|telegram|twitter|pinterest|linkedin|line|discord|teams|zoom|amazon|alibaba|youtube|skype|vk|reddit|viber|signal|badoo|binance|sharechat)-[a-z0-9\-]+\.[a-z]+",
145-
146-
# Impersonation for fintech and banks
147-
r"https?://(?:[a-z0-9\-]+\.)*(?:paypal|stripe|payoneer|bankofamerica|chase|wellsfargo|citibank|hsbc|barclays)\.com\.[a-z0-9\-]+\.[a-z]+",
148-
r"https?://(?:[a-z0-9\-]+\.)*(?:paypal|stripe|payoneer|bankofamerica|chase|wellsfargo|citibank|hsbc|barclays)-[a-z0-9\-]+\.[a-z]+",
149-
150-
# Common URL shorteners
151-
r"https?://bit\.ly",
152-
r"https?://goo\.gl",
153-
r"https?://t\.co",
154-
# IP Address URLs
155-
r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
156-
# Generic suspicious keywords in the domain
157-
r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)",
158-
# Very long subdomains or many hyphens
159-
r"https?://(?:[a-z0-9\-]+\.){4,}",
160-
r"https?://[^/]*\-.*\-.*\-.*[a-z]+",
161-
]
156+
def generate_suspicious_url_patterns(legitimate_domains):
157+
"""
158+
Generates regex patterns to detect URLs impersonating legitimate domains.
159+
"""
160+
all_service_keywords = set()
161+
for platform, domains in legitimate_domains.items():
162+
if platform not in ["general", "general_web", "banks"]:
163+
all_service_keywords.add(platform)
164+
for domain in domains:
165+
# Add the core part of the domain, e.g., "facebook" from "facebook.com"
166+
keyword = domain.split('.')[0]
167+
if keyword != "com" and len(keyword) > 2:
168+
all_service_keywords.add(keyword)
169+
170+
# Remove very generic keywords that might cause false positives
171+
all_service_keywords -= {'google', 'apple', 'microsoft'}
172+
173+
# Create a regex group of all keywords
174+
keyword_group = "|".join(re.escape(k) for k in sorted(list(all_service_keywords), key=len, reverse=True))
175+
176+
patterns = [
177+
# Impersonation using subdomains or hyphens, e.g., "facebook.security-alert.com" or "facebook-login.com"
178+
# This now uses the dynamically generated keyword group
179+
r"https?://(?:[a-z0-9\-]+\.)*(?:" + keyword_group + r")\.(?:[a-z0-9\-]+)\.(?:[a-z]+)",
180+
r"https?://(?:[a-z0-9\-]+\.)*(?:" + keyword_group + r")-(?:[a-z0-9\-]+)\.(?:[a-z]+)",
181+
182+
# Common URL shorteners
183+
r"https?://bit\.ly",
184+
r"https?://goo\.gl",
185+
r"https?://t\.co",
186+
# IP Address URLs
187+
r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
188+
# Generic suspicious keywords in the domain combined with suspicious TLDs
189+
r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)",
190+
# Very long subdomains (potential phishing)
191+
r"https?://(?:[a-z0-9\-]+\.){4,}",
192+
# Multiple hyphens in the domain (potential phishing)
193+
r"https?://[^/]*\-.*\-.*\-.*[a-z]+",
194+
]
195+
return patterns
196+
197+
SUSPICIOUS_URL_PATTERNS = generate_suspicious_url_patterns(LEGITIMATE_DOMAINS)
162198

163199

164200
# --- Scoring Weights ---

social_media_analyzer/main.py

Lines changed: 107 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,51 @@
11
from . import fake_profile_detector
22
from . import scam_detector
33

4-
def main():
5-
"""Main function to run the social media analyzer."""
6-
print("--- Social Media Analyzer ---")
7-
print("This tool helps you analyze social media profiles and messages for potential scams.")
4+
def analyze_website_url():
5+
"""Analyzes a website URL for potential scams."""
6+
url_to_check = input("Please enter the full URL you want to analyze: ").strip()
7+
if not url_to_check:
8+
print("No URL entered.")
9+
return
10+
11+
# Ensure the URL has a scheme
12+
if not url_to_check.startswith(('http://', 'https://')):
13+
url_to_check = 'http://' + url_to_check
814

9-
platforms = ["facebook", "instagram", "whatsapp", "tiktok", "tinder", "snapchat", "wechat", "telegram", "twitter", "pinterest", "linkedin", "line", "discord", "teams", "zoom", "amazon", "alibaba", "youtube", "skype", "vk", "reddit", "email", "viber", "signal", "badoo", "binance", "sharechat", "browser", "messenger", "qzone", "qq", "vimeo", "musical.ly"]
15+
print("\n--- Analyzing URL ---")
16+
is_susp, reason = scam_detector.is_url_suspicious(url_to_check, platform="general_web")
17+
if is_susp:
18+
print(f"\n[!] The URL '{url_to_check}' is flagged as IMMEDIATELY SUSPICIOUS.")
19+
print(f"Reason: {reason}")
20+
# We can stop here as the URL itself is a major red flag
21+
return
22+
else:
23+
print(f"\n[+] The URL '{url_to_check}' does not match common suspicious patterns.")
24+
print("Now analyzing the website's content...")
25+
26+
# Analyze the content of the website
27+
content_result = scam_detector.analyze_url_content(url_to_check)
28+
29+
print("\n--- Website Content Analysis Results ---")
30+
if "error" in content_result:
31+
print(f"Could not analyze website content: {content_result['error']}")
32+
elif not content_result.get("indicators_found"):
33+
print("No specific scam indicators were found in the website content.")
34+
else:
35+
print(f"Score: {content_result['score']} (Higher is more suspicious)")
36+
print("Indicators Found:")
37+
for indicator in content_result['indicators_found']:
38+
print(f"- {indicator}")
39+
40+
def analyze_social_media():
41+
"""Handles the analysis of social media platforms."""
42+
platforms = sorted([
43+
"facebook", "instagram", "whatsapp", "tiktok", "tinder", "snapchat",
44+
"wechat", "telegram", "twitter", "pinterest", "linkedin", "line",
45+
"discord", "teams", "zoom", "amazon", "alibaba", "youtube", "skype",
46+
"vk", "reddit", "email", "viber", "signal", "badoo", "binance",
47+
"sharechat", "messenger", "qzone", "qq", "vimeo", "musical.ly"
48+
])
1049

1150
while True:
1251
print("\nSelect the social media platform you want to analyze:")
@@ -23,61 +62,72 @@ def main():
2362
except ValueError:
2463
print("Invalid input. Please enter a number.")
2564

26-
if platform == "browser":
27-
url_to_check = input("Please enter the URL you want to analyze: ").strip()
28-
if url_to_check:
29-
is_susp, reason = scam_detector.is_url_suspicious(url_to_check)
30-
print("\n--- URL Analysis Results ---")
31-
if is_susp:
32-
print(f"The URL '{url_to_check}' is SUSPICIOUS.")
33-
print(f"Reason: {reason}")
34-
else:
35-
print(f"The URL '{url_to_check}' does not seem suspicious.")
36-
print(f"Details: {reason}")
37-
else:
38-
print("No URL entered.")
39-
else:
40-
while True:
41-
print(f"\nWhat do you want to do for {platform.capitalize()}?")
42-
print("1. Analyze a profile for signs of being fake.")
43-
print("2. Analyze a profile for identity usurpation.")
44-
print("3. Analyze a message for phishing or scam attempts.")
65+
while True:
66+
print(f"\nWhat do you want to do for {platform.capitalize()}?")
67+
print("1. Analyze a profile for signs of being fake.")
68+
print("2. Analyze a profile for identity usurpation.")
69+
print("3. Analyze a message for phishing or scam attempts.")
4570

46-
try:
47-
analysis_choice = int(input("Enter your choice (1-3): "))
48-
if analysis_choice == 1:
49-
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze: ").strip()
50-
if profile_url:
51-
fake_profile_detector.analyze_profile_based_on_user_input(profile_url, platform)
52-
else:
53-
print("No profile URL entered.")
54-
break
55-
elif analysis_choice == 2:
56-
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze for impersonation: ").strip()
57-
if profile_url:
58-
fake_profile_detector.analyze_identity_usurpation(profile_url, platform)
59-
else:
60-
print("No profile URL entered.")
61-
break
62-
elif analysis_choice == 3:
63-
message = input("Paste the message you want to analyze: ").strip()
64-
if message:
65-
result = scam_detector.analyze_text_for_scams(message, platform)
66-
print("\n--- Scam Analysis Results ---")
67-
print(f"Score: {result['score']} (Higher is more suspicious)")
68-
print("Indicators Found:")
69-
if result['indicators_found']:
70-
for indicator in result['indicators_found']:
71-
print(f"- {indicator}")
72-
else:
73-
print("No specific scam indicators were found.")
71+
try:
72+
analysis_choice = int(input("Enter your choice (1-3): "))
73+
if analysis_choice == 1:
74+
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze: ").strip()
75+
if profile_url:
76+
fake_profile_detector.analyze_profile_based_on_user_input(profile_url, platform)
77+
else:
78+
print("No profile URL entered.")
79+
break
80+
elif analysis_choice == 2:
81+
profile_url = input(f"Enter the {platform.capitalize()} profile URL to analyze for impersonation: ").strip()
82+
if profile_url:
83+
fake_profile_detector.analyze_identity_usurpation(profile_url, platform)
84+
else:
85+
print("No profile URL entered.")
86+
break
87+
elif analysis_choice == 3:
88+
message = input("Paste the message you want to analyze: ").strip()
89+
if message:
90+
result = scam_detector.analyze_text_for_scams(message, platform)
91+
print("\n--- Scam Analysis Results ---")
92+
print(f"Score: {result['score']} (Higher is more suspicious)")
93+
print("Indicators Found:")
94+
if result['indicators_found']:
95+
for indicator in result['indicators_found']:
96+
print(f"- {indicator}")
7497
else:
75-
print("No message entered.")
76-
break
98+
print("No specific scam indicators were found.")
7799
else:
78-
print("Invalid choice. Please try again.")
79-
except ValueError:
80-
print("Invalid input. Please enter a number.")
100+
print("No message entered.")
101+
break
102+
else:
103+
print("Invalid choice. Please try again.")
104+
except ValueError:
105+
print("Invalid input. Please enter a number.")
106+
107+
def main():
108+
"""Main function to run the security analyzer."""
109+
print("--- Universal Security Analyzer ---")
110+
print("This tool helps you analyze social media, messages, and websites for potential scams.")
111+
112+
while True:
113+
print("\n--- Main Menu ---")
114+
print("1. Analyze a Social Media Platform")
115+
print("2. Analyze a Website URL")
116+
print("3. Exit")
117+
118+
try:
119+
choice = int(input("Enter your choice (1-3): "))
120+
if choice == 1:
121+
analyze_social_media()
122+
elif choice == 2:
123+
analyze_website_url()
124+
elif choice == 3:
125+
print("Exiting. Stay safe!")
126+
break
127+
else:
128+
print("Invalid choice. Please try again.")
129+
except ValueError:
130+
print("Invalid input. Please enter a number.")
81131

82132
if __name__ == '__main__':
83133
main()

social_media_analyzer/scam_detector.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import urllib.request
23
from urllib.parse import urlparse
34
from .heuristics import (
45
URGENCY_KEYWORDS,
@@ -130,3 +131,21 @@ def analyze_text_for_scams(text_content, platform=None):
130131
"urls_analyzed": urls_analyzed_details
131132
}
132133

134+
def analyze_url_content(url, max_bytes=2 * 1024 * 1024):
    """
    Fetches the content of a URL and analyzes it for scams.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to download.
        max_bytes: Upper bound on the number of response-body bytes read, so
            an oversized page cannot exhaust memory.

    Returns:
        The result of ``analyze_text_for_scams`` for the extracted page text
        (called with ``platform="general_web"``) on success, otherwise a dict
        of the form ``{"error": <message>}``. Failures are reported through
        that dict rather than raised.
    """
    try:
        # urlopen also handles file:// and ftp:// URLs; only web pages are
        # meant to be fetched here, so anything else is rejected up front.
        scheme = urlparse(url).scheme.lower()
        if scheme not in ("http", "https"):
            return {"error": f"Unsupported URL scheme: {scheme or 'none'} (only http and https are allowed)"}

        # Add a user-agent to avoid being blocked by some websites
        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=10) as response:
            if response.status != 200:
                return {"error": f"Failed to fetch URL: HTTP status code {response.status}"}
            raw_body = response.read(max_bytes)
            # Prefer the charset declared by the server; default to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'

        try:
            html_content = raw_body.decode(charset, errors='ignore')
        except LookupError:
            # The server declared a charset name Python does not know.
            html_content = raw_body.decode('utf-8', errors='ignore')

        text_content = _html_to_text(html_content)
        return analyze_text_for_scams(text_content, platform="general_web")
    except Exception as e:
        # Boundary handler: callers inspect the "error" key instead of
        # catching exceptions, so every failure is converted to that shape.
        return {"error": f"An error occurred: {e}"}


def _html_to_text(html_content):
    """Reduce HTML to approximate visible text using regexes only (no new dependencies)."""
    # Drop <script>/<style> elements together with their bodies, so code and
    # CSS are not analyzed as if they were page text.
    without_code = re.sub(r'(?is)<(script|style)\b[^>]*>.*?</\1\s*>', ' ', html_content)
    # Replace each remaining tag with a space rather than nothing, so words
    # from adjacent elements ("<p>a</p><p>b</p>") are not fused together.
    return re.sub(r'<[^>]+>', ' ', without_code)

0 commit comments

Comments
 (0)