11import re
22import urllib .request
3+ import requests
4+ import os
35from urllib .parse import urlparse
46from .heuristics import (
57 URGENCY_KEYWORDS ,
1719 SUSPICIOUS_URL_PATTERNS
1820)
1921
def check_google_safe_browsing(url, api_key):
    """Check a URL against the Google Safe Browsing v4 Lookup API.

    Args:
        url: The URL to check.
        api_key: Google API key. If falsy, the remote check is skipped.

    Returns:
        Tuple ``(is_suspicious, reason)``. ``is_suspicious`` is True only
        when the API reports a threat match; configuration problems,
        HTTP errors, and network failures all yield ``(False, reason)``
        so callers can fall back to local heuristics.
    """
    if not api_key:
        return False, "Google Safe Browsing API key not configured."

    api_url = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={api_key}"
    payload = {
        "client": {
            "clientId": "social-media-analyzer",
            "clientVersion": "1.0.0",
        },
        "threatInfo": {
            "threatTypes": [
                "MALWARE",
                "SOCIAL_ENGINEERING",
                "UNWANTED_SOFTWARE",
                "POTENTIALLY_HARMFUL_APPLICATION",
            ],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url}],
        },
    }
    try:
        response = requests.post(api_url, json=payload, timeout=10)
        if response.status_code != 200:
            return False, f"Google Safe Browsing API error: {response.status_code}"
        try:
            data = response.json()
        except ValueError:
            # Malformed/non-JSON body from the API -- treat as inconclusive
            # rather than letting the decode error escape to the caller.
            return False, "Google Safe Browsing returned an unreadable response."
        matches = data.get("matches")
        if matches:
            # Report the first match; .get() guards against a match entry
            # that lacks a "threatType" field (original code raised KeyError,
            # and IndexError on an empty "matches" list).
            threat_type = matches[0].get("threatType", "UNKNOWN")
            return True, f"Flagged by Google Safe Browsing as {threat_type}."
        return False, "Clean according to Google Safe Browsing."
    except requests.RequestException as e:
        return False, f"Could not connect to Google Safe Browsing: {e}"
56+
2057def get_legitimate_domains (platform = None ):
2158 """
2259 Returns a list of legitimate domains for a given platform,
@@ -35,16 +72,24 @@ def get_domain_from_url(url):
3572 domain = url .split ("/" )[0 ].split ("?" )[0 ]
3673 return domain .lower ()
3774
38- def is_url_suspicious (url , platform = None ):
75+ def is_url_suspicious (url , platform = None , api_key = None ):
3976 """
40- Checks if a URL is suspicious based on various patterns and lists.
77+ Checks if a URL is suspicious based on various patterns and lists,
78+ including Google Safe Browsing.
4179 Returns a tuple: (bool_is_suspicious, reason_string)
4280 """
81+ # 1. Google Safe Browsing Check
82+ if api_key :
83+ is_susp , reason = check_google_safe_browsing (url , api_key )
84+ if is_susp :
85+ return True , reason
86+
87+ # 2. Local Heuristics
4388 normalized_url = url .lower ()
4489 domain = get_domain_from_url (url )
4590 legitimate_domains = get_legitimate_domains (platform )
4691
47- # 1. Check if the domain is in the legitimate list for the platform
92+ # Check if the domain is in the legitimate list for the platform
4893 if domain in legitimate_domains :
4994 # Still check for impersonation patterns that might include the legit domain
5095 for pattern in SUSPICIOUS_URL_PATTERNS :
@@ -53,24 +98,24 @@ def is_url_suspicious(url, platform=None):
5398 return True , f"URL impersonates a legitimate domain: { pattern } "
5499 return False , "URL domain is on the legitimate list."
55100
56- # 2. Check against known suspicious patterns
101+ # Check against known suspicious patterns
57102 for pattern in SUSPICIOUS_URL_PATTERNS :
58103 if re .search (pattern , normalized_url , re .IGNORECASE ):
59104 return True , f"URL matches suspicious pattern: { pattern } "
60105
61- # 3. Check for suspicious TLDs
106+ # Check for suspicious TLDs
62107 suspicious_tld_regex = re .compile (r"\.(" + "|" .join (tld .lstrip ('.' ) for tld in SUSPICIOUS_TLDS ) + r")$" , re .IGNORECASE )
63108 if suspicious_tld_regex .search (domain ):
64109 return True , f"URL uses a potentially suspicious TLD."
65110
66- # 4. Check if a known legitimate service name is part of the domain, but it's not official
111+ # Check if a known legitimate service name is part of the domain, but it's not official
67112 for service in LEGITIMATE_DOMAINS .keys ():
68113 if service != "general" and service in domain :
69114 return True , f"URL contains the name of a legitimate service ('{ service } ') but is not an official domain."
70115
71116 return False , "URL does not match common suspicious patterns."
72117
73- def analyze_text_for_scams (text_content , platform = None ):
118+ def analyze_text_for_scams (text_content , platform = None , api_key = None ):
74119 """
75120 Analyzes a block of text content for various scam indicators.
76121 """
@@ -103,10 +148,14 @@ def analyze_text_for_scams(text_content, platform=None):
103148 # 2. Regex-based checks
104149 found_urls = URL_PATTERN .findall (text_content )
105150 for url_str in found_urls :
106- is_susp , reason = is_url_suspicious (url_str , platform )
151+ is_susp , reason = is_url_suspicious (url_str , platform , api_key )
107152 url_analysis = {"url" : url_str , "is_suspicious" : is_susp , "reason" : reason }
108153 if is_susp :
109- score += HEURISTIC_WEIGHTS .get ("SUSPICIOUS_URL_PATTERN" , 3.0 )
154+ # Increase score significantly if flagged by Google
155+ if "Google Safe Browsing" in reason :
156+ score += HEURISTIC_WEIGHTS .get ("GOOGLE_SAFE_BROWSING_HIT" , 10.0 )
157+ else :
158+ score += HEURISTIC_WEIGHTS .get ("SUSPICIOUS_URL_PATTERN" , 3.0 )
110159 indicators_found .append (f"Suspicious URL found: { url_str } (Reason: { reason } )" )
111160 urls_analyzed_details .append (url_analysis )
112161
0 commit comments