-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathurl_extractor.py
More file actions
277 lines (220 loc) · 9.34 KB
/
url_extractor.py
File metadata and controls
277 lines (220 loc) · 9.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
URL提取和清理工具
专门处理包含HTML标签的伪URL,提取真正的订阅链接
"""
import html
import logging
import re
import urllib.parse
from typing import List, Optional, Set
from urllib.parse import urlparse, unquote
class URLExtractor:
    """Extract and normalize subscription URLs from noisy, HTML-polluted text.

    Search results often embed the real subscription link inside HTML markup
    (e.g. ``<code>https://...subscribe?token=xxx</code>``).  This class finds
    such links, strips the markup, and validates the result.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Regex patterns for the subscription-link shapes we recognize.
        self.subscription_patterns = [
            # Standard form: https://domain.com/api/v1/client/subscribe?token=xxx
            r'https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+',
            # Same URL wrapped in <code> tags
            r'<code>https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+</code>',
            # Same URL wrapped in quotes
            r'["\']https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+["\']',
            # Looser form allowing additional query parameters
            r'https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+(?:&[^=\s"\'<>]*=[^=\s"\'<>]*)*',
            # Explicit token + flag parameters
            r'https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+&flag=[A-Za-z0-9]+',
            # Other common subscription endpoints
            r'https?://[^\s"\'<>]+/subscribe/link\?token=[A-Za-z0-9]+',
            r'https?://[^\s"\'<>]+/getSubscribe\?token=[A-Za-z0-9]+',
            r'https?://[^\s"\'<>]+/sub\?target=[A-Za-z0-9]+&url=[^\s"\'<>]+',
            r'https?://[^\s"\'<>]+/link/[A-Za-z0-9]+(?:\?[^\s"\'<>]*)?',
            r'https?://[^\s"\'<>]+/s/[A-Za-z0-9]+',
            # Proxy-protocol links carrying base64 payloads.
            # NOTE(review): matches of this pattern are later rejected by
            # clean_and_validate_url (which requires the api/v1 path) —
            # confirm whether that filtering is intended.
            r'(?:vmess|vless|trojan|ss|ssr|hysteria2?)://[A-Za-z0-9+/=]+',
            # Popular URL-shortener services
            r'https?://(?:bit\.ly|goo\.gl|tinyurl\.com|t\.co|short\.link)/[A-Za-z0-9]+',
        ]
        # Patterns stripped out of candidate text during cleanup.
        self.html_cleanup_patterns = [
            r'<[^>]+>',          # any HTML tag
            r'&[a-zA-Z0-9#]+;',  # leftover HTML entities
            r'%[0-9A-Fa-f]{2}',  # stray percent-escapes left after unquoting
        ]

    def extract_subscription_urls(self, text: str) -> List[str]:
        """Extract every subscription link found in *text*.

        Args:
            text: Arbitrary text that may contain subscription links.

        Returns:
            De-duplicated list of cleaned, validated subscription URLs
            (order is unspecified).
        """
        urls = set()
        for pattern in self.subscription_patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                # Strip markup first, then normalize and validate.
                clean_url = self.clean_and_validate_url(self.clean_html_tags(match))
                if clean_url:
                    urls.add(clean_url)
        return list(urls)

    def clean_html_tags(self, text: str) -> str:
        """Remove HTML tags, HTML entities, and URL-encoding from *text*.

        Args:
            text: Text possibly containing HTML markup.

        Returns:
            The cleaned, whitespace-stripped text.
        """
        # BUGFIX: the original looped `while '&' in text: text =
        # text.replace('&', '&')`, which replaces '&' with itself and never
        # terminates for any text containing '&'.  Unescape to a fixpoint
        # instead, so nested encodings like '&amp;amp;' -> '&amp;' -> '&'
        # are fully resolved and the loop always stops.
        previous = None
        while previous != text:
            previous = text
            text = html.unescape(text)
        # Decode percent-encoded sequences.
        text = unquote(text)
        # Finally strip tags, leftover entities, and stray percent-escapes.
        for pattern in self.html_cleanup_patterns:
            text = re.sub(pattern, '', text)
        return text.strip()

    def clean_and_validate_url(self, url: str) -> Optional[str]:
        """Clean *url* and return it only if it is a valid subscription URL.

        Args:
            url: Candidate URL, possibly with surrounding quotes, HTML
                remnants, or trailing prose.

        Returns:
            The rebuilt canonical URL, or ``None`` when the candidate is not
            an ``api/v1/client/subscribe`` link or is structurally invalid.
        """
        try:
            url = url.strip()
            # Drop HTML-tag residue and surrounding quotes.
            url = re.sub(r'<[^>]*>', '', url)
            url = url.strip('"\'')
            # Only the api/v1 subscribe endpoint is accepted here.
            if 'api/v1/client/subscribe?token=' not in url:
                return None
            # Cut off trailing prose: stop before whitespace/CJK characters
            # while still allowing extra '&'-separated parameters.
            url_match = re.match(
                r'(https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+(?:&[^一-龯\s]*)?)',
                url)
            if url_match:
                url = url_match.group(1)
            else:
                # Fall back to the bare URL without extra parameters.
                url_match = re.match(
                    r'(https?://[^\s"\'<>]+api/v1/client/subscribe\?token=[A-Za-z0-9]+)',
                    url)
                if url_match:
                    url = url_match.group(1)
            parsed = urlparse(url)
            # Must have a scheme and a host, and be plain http(s).
            if not parsed.scheme or not parsed.netloc:
                return None
            if parsed.scheme not in ['http', 'https']:
                return None
            # Rebuild the URL from its parsed parts to normalize it.
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.query:
                clean_url += f"?{parsed.query}"
            return clean_url
        except Exception as e:
            # Best-effort: a malformed candidate is dropped, never fatal.
            self.logger.debug(f"URL清理失败: {url} - {e}")
            return None

    def extract_from_search_results(self, search_results: List[dict]) -> List[str]:
        """Extract subscription links from search-result dictionaries.

        Args:
            search_results: Result dicts; the 'link', 'title', and 'snippet'
                fields (when present and non-empty) are scanned.

        Returns:
            De-duplicated list of subscription URLs found in any field.
        """
        all_urls = set()
        for result in search_results:
            # Scan every text-bearing field of the result.
            for field in ('link', 'title', 'snippet'):
                value = result.get(field, '')
                if value:
                    all_urls.update(self.extract_subscription_urls(value))
        return list(all_urls)

    def process_mixed_urls(self, urls: List[str]) -> List[str]:
        """Normalize a mixed list of real URLs and HTML-polluted pseudo-URLs.

        Args:
            urls: Candidate strings, clean or dirty.

        Returns:
            De-duplicated list of clean URLs.
        """
        clean_urls = set()
        for url in urls:
            if self.is_clean_url(url):
                # Already clean — keep verbatim.
                clean_urls.add(url)
            else:
                # Dirty: run the full extraction pipeline on it.
                clean_urls.update(self.extract_subscription_urls(url))
        return list(clean_urls)

    def is_clean_url(self, url: str) -> bool:
        """Return True when *url* contains no HTML markup.

        Args:
            url: URL string to inspect.

        Returns:
            False if any HTML tag, HTML entity, or bare angle bracket is
            present; True otherwise.
        """
        if re.search(r'<[^>]+>', url):
            return False
        if re.search(r'&[a-zA-Z0-9#]+;', url):
            return False
        # Any bare '<' or '>' also disqualifies the URL.
        html_indicators = ['<code>', '</code>', '<br/>', '<div', '</div>', '<', '>']
        for indicator in html_indicators:
            if indicator in url:
                return False
        return True
def test_url_extractor():
    """Manual smoke test: run the extractor against representative samples."""
    print("🧪 测试URL提取器")
    print("=" * 50)

    extractor = URLExtractor()

    # Representative inputs: an HTML-wrapped pseudo-URL, a quoted URL, a
    # clean URL, one with extra query parameters, and a URL inside prose.
    test_cases = [
        'https://t.me/>订阅链接:<code>https://daka778.top/api/v1/client/subscribe?token=1b4d963259351e7719fdb6ce4276cf6d</code><br/>订阅流量:<code>100 GB</code>',
        '"https://example.com/api/v1/client/subscribe?token=abc123"',
        'https://daka778.top/api/v1/client/subscribe?token=1b4d963259351e7719fdb6ce4276cf6d',
        'https://example.com/api/v1/client/subscribe?token=abc123&flag=clash',
        'Some text https://test.com/api/v1/client/subscribe?token=xyz789 more text',
    ]

    for i, test_case in enumerate(test_cases, 1):
        print(f"\n测试用例 {i}:")
        print(f"输入: {test_case[:100]}...")

        urls = extractor.extract_subscription_urls(test_case)
        print(f"提取结果: {urls}")

        # Report cleanliness of each extracted URL inline.
        for url in urls:
            print(f" - {url} (干净: {extractor.is_clean_url(url)})")
# Allow running this module directly as a smoke test.
if __name__ == "__main__":
    test_url_extractor()