@@ -1,5 +1,3 @@
-# 20.04.2024
-
 import re
 import os
 import json
@@ -47,6 +45,90 @@ def get_new_tld(full_url):
 
     return None
 
+def get_enhanced_headers():
+    ua = ua_generator.generate(device='desktop', browser='chrome')
+    headers = ua.headers.get()
+
+    additional_headers = {
+        'DNT': '1',
+        'Upgrade-Insecure-Requests': '1',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9,it;q=0.8',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Cache-Control': 'max-age=0',
+        'Connection': 'keep-alive',
+        'Referer': 'https://www.google.com/',
+    }
+
+    headers.update(additional_headers)
+    return headers
+
+def extract_redirect_from_403(response, original_url):
+    redirect_headers = ['location', 'refresh', 'x-redirect-to', 'x-location', 'redirect']
+    for header in redirect_headers:
+        if header in response.headers:
+            return response.headers[header]
+
+    try:
+        content = response.text
+
+        js_patterns = [
+            r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
+            r'window\.location\s*=\s*["\']([^"\']+)["\']',
+            r'location\.href\s*=\s*["\']([^"\']+)["\']',
+            r'document\.location\s*=\s*["\']([^"\']+)["\']',
+            r'top\.location\.href\s*=\s*["\']([^"\']+)["\']',
+            r'parent\.location\s*=\s*["\']([^"\']+)["\']'
+        ]
+
+        for pattern in js_patterns:
+            match = re.search(pattern, content, re.IGNORECASE)
+            if match:
+                return match.group(1)
+
+        meta_patterns = [
+            r'<meta[^>]*http-equiv=["\']?refresh["\']?[^>]*content=["\'][^"\']*url=([^"\'>\s]+)',
+            r'<meta[^>]*content=["\'][^"\']*url=([^"\'>\s]+)[^>]*http-equiv=["\']?refresh["\']?'
+        ]
+
+        for pattern in meta_patterns:
+            match = re.search(pattern, content, re.IGNORECASE)
+            if match:
+                return match.group(1)
+
+        text_patterns = [
+            r'[Rr]edirect(?:ed)?\s+to:?\s*([^\s<>"\']+)',
+            r'[Nn]ew\s+[Uu][Rr][Ll]:?\s*([^\s<>"\']+)',
+            r'[Mm]oved\s+to:?\s*([^\s<>"\']+)',
+            r'[Ff]ound\s+at:?\s*([^\s<>"\']+)',
+            r'[Gg]o\s+to:?\s*([^\s<>"\']+)',
+            r'[Vv]isit:?\s*([^\s<>"\']+)',
+            r'https?://[^\s<>"\']+\.[a-z]{2,}[^\s<>"\']*'
+        ]
+
+        for pattern in text_patterns:
+            match = re.search(pattern, content)
+            if match:
+                potential_url = match.group(1) if '(' in pattern else match.group(0)
+                if potential_url.startswith(('http://', 'https://', '//')):
+                    return potential_url
+
+        link_patterns = [
+            r'<a[^>]*href=["\']([^"\']+)["\'][^>]*>(?:click here|continue|proceed|go here)',
+            r'<link[^>]*rel=["\']?canonical["\']?[^>]*href=["\']([^"\']+)["\']',
+            r'<base[^>]*href=["\']([^"\']+)["\']'
+        ]
+
+        for pattern in link_patterns:
+            match = re.search(pattern, content, re.IGNORECASE)
+            if match:
+                return match.group(1)
+
+    except Exception:
+        pass
+
+    return None
+
 def extract_domain_from_response(response, original_url):
     if 'location' in response.headers:
         return response.headers['location']
@@ -108,7 +190,10 @@ def extract_domain_from_response(response, original_url):
 
     return None
 
-def try_url(url_to_try, headers, timeout=15):
+def try_url(url_to_try, headers=None, timeout=15):
+    if headers is None:
+        headers = get_enhanced_headers()
+
     try:
         with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as client:
             response = client.get(url_to_try)
@@ -136,7 +221,20 @@ def try_url(url_to_try, headers, timeout=15):
                         request=response.request
                     )
 
-            elif response.status_code in [403, 409, 429, 503]:
+            elif response.status_code == 403:
+                print(f" [!] HTTP 403 - attempting enhanced extraction")
+
+                redirect_url = extract_redirect_from_403(response, url_to_try)
+                if redirect_url:
+                    print(f" [+] Found redirect URL in 403 response: {redirect_url}")
+                    return httpx.Response(
+                        status_code=200,
+                        headers={"location": redirect_url},
+                        content=b"",
+                        request=response.request
+                    )
+
+            elif response.status_code in [409, 429, 503]:
                 print(f" [!] HTTP {response.status_code} - attempting to extract redirect info")
 
                 location = response.headers.get('location')
@@ -194,15 +292,12 @@ def update_domain_entries(data):
             print(f" [!] 'full_url' missing. Skipped.")
             continue
 
-        ua = ua_generator.generate(device=('desktop', 'mobile'), browser=('chrome', 'edge', 'firefox', 'safari'))
-        current_headers = ua.headers.get()
-
         print(f" [] Stored URL: {original_full_url}")
         if original_domain_in_entry:
             print(f" [] Stored Domain (TLD): {original_domain_in_entry}")
 
         print(f" [] Testing URL: {original_full_url}")
-        response = try_url(original_full_url, current_headers)
+        response = try_url(original_full_url)
 
         if response:
            final_url_from_request = str(response.url)
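
A minimal usage sketch of the two changes above (not part of the commit; the URLs are illustrative and the functions are assumed importable from the updated script): extract_redirect_from_403() pulling a target out of a synthetic 403 body, and try_url() falling back to get_enhanced_headers() when no headers are passed.

import httpx

# Synthetic 403 whose HTML body carries a meta-refresh target (hypothetical URLs).
request = httpx.Request("GET", "https://old.example/")
body = b'<meta http-equiv="refresh" content="0; url=https://new.example/">'
blocked = httpx.Response(403, content=body, request=request)

print(extract_redirect_from_403(blocked, "https://old.example/"))
# expected: https://new.example/ (matched by the meta-refresh pattern)

# try_url() can now be called without explicit headers; it builds them via
# get_enhanced_headers() internally. Note this line performs a live request.
response = try_url("https://old.example/")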