Skip to content

Commit 1feb9a8

Browse files
authored
获取今日帖子时屏蔽置顶帖 (#48)
重构了get_today_list函数
1 parent 700a36d commit 1feb9a8

File tree

1 file changed

+43
-60
lines changed

1 file changed

+43
-60
lines changed

AutoReply.py

Lines changed: 43 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
DEBUG = False
1414

15-
__verison__ = "0.25.02.25.1"
15+
__verison__ = "0.25.04.06.1"
1616

1717
def outputLog(projectName):
1818
log = logging.getLogger(f"{projectName}")
@@ -436,68 +436,51 @@ def get_today_list(self):
436436
res = requests.get(self.Today , headers = self.Headers, proxies = proxies)
437437
content = res.text
438438

439-
pat_title : str = ('htm_data/\w+/\w+/\w+.html')
440-
pat_moderator : str = "版主:([\s\S]*?)<\/span>"
441-
pat_username : str = "username=(\w+)"
442-
pat_user : str = 'class="bl">(.*)?</a>'
443-
pat_all_title : str = '<h3><a href="/([\s\S]*?)"'
444-
pat_all_content : str = '<h3><a href=".*" target="_blank" id=".*">(.*)<\/a><\/h3>'
445-
moderator : str = re.search(pat_moderator, content).group(0)
446-
username : List = re.findall(pat_username, moderator)
447-
content = res.text[res.text.find('普通主題'):]
448-
all_username : List = re.findall(pat_user, content)
449-
title = re.findall(pat_title , content)
450-
all_title = re.findall(pat_all_title , content)
451-
all_content = re.findall(pat_all_content , content)
452-
453-
log.debug(f"{self.username} get list number: {str(len(title))}")
454-
455-
if len(all_title) != len(all_username):
456-
if self.RetryList > 0:
457-
log.debug(f"{self.username} get list number error , retry get list , remaining retry times: %d" % self.RetryList)
458-
self.RetryList -= 1
459-
sleep_time = random.randint(6,60)
460-
log.debug(f"{self.username} sleep {sleep_time} seconds")
461-
sleep(sleep_time)
462-
self.get_today_list()
463-
return
464-
else:
465-
self.set_invalid()
466-
self.s.close()
467-
return
439+
# 版主列表 moderator
440+
moderator : List = re.findall(r"username=(\w+)", re.search(r"版主:([\s\S]*?)<\/span>", content).group(0))
441+
442+
# 提取出所有普通主題
443+
content = re.search(r'<tbody style="table-layout:fixed;" id="tbody">(.*?)</tbody>', content, re.DOTALL).group(0)
468444

445+
# 提取出每一个主题, 每个主题以 <tr class="tr3 t_one tac"> ... </tr> 为一组
446+
all_threads = []
447+
tr_pattern = re.compile(r'<tr class="tr3 t_one tac">(.*?)</tr>',re.DOTALL)
448+
449+
# 遍历每个匹配到的<tr>块
450+
for tr_block in tr_pattern.findall(content):
451+
# 屏蔽置顶帖
452+
if "Top-marks" in tr_block:continue
453+
454+
# 提取所有<td>元素
455+
td_matches = re.findall(r'<td.*?>(.*?)</td>', tr_block, re.DOTALL)
456+
457+
# 提取 url 和 title(第二个<td>)
458+
url_title_match = re.search(r'<h3>.*?<a href="/(.*?)".*?>(.*?)</a>.*?</h3>', td_matches[1], re.DOTALL)
459+
460+
url = url_title_match.group(1)
461+
title = re.sub(r'<.*?>', '', url_title_match.group(2)).strip() # 去除HTML标签
462+
463+
# 提取 author(第三个<td>)
464+
author = re.search(r'<a href=".*?" class="bl">(.*?)</a>', td_matches[2], re.DOTALL).group(1).strip()
465+
466+
all_threads.append({
467+
'url': url,
468+
'title': title,
469+
'author': author
470+
})
471+
472+
log.debug(f"{self.username} get list number: {str(len(all_threads))}")
469473

470474
if Forbid:
471-
black_list : List = []
472-
log.debug("moderator list: " + str(" ".join(username)))
473-
for index in range(len(all_username)):
474-
if all_username[index].strip() in moderator:
475-
black_list.append(all_title[index])
476-
for item in black_list:
477-
try:
478-
title.remove(item)
479-
log.debug(f"{self.username} remove {item} from list")
480-
except Exception as e:
481-
log.error(f"{self.username} remove {item} from list 失败, 错误类型: {type(e).__name__} 描述: {e}")
482-
483-
black_list : List = []
484-
log.debug(f"{self.username} 排除: {self.excludeContent}")
485-
for index in range(len(all_content)):
486-
content = all_content[index]
487-
for item in self.excludeContent:
488-
if item in content:
489-
black_list.append(all_title[index])
490-
break
491-
492-
for item in black_list:
493-
try:
494-
title.remove(item)
495-
log.debug(f"{self.username} remove {item} from list")
496-
except Exception as e:
497-
log.error(f"{self.username} remove {item} from list 失败, 错误类型: {type(e).__name__} 描述: {e}")
498-
499-
self.ReplyList = title
500-
log.debug(f"{self.username} get reply list number {str(len(title))}")
475+
log.debug("moderator list: " + str(" ".join(moderator)))
476+
do_not_reply = [thread for thread in all_threads if (thread["author"] in moderator) or (thread["title"] in self.excludeContent)]
477+
478+
for item in do_not_reply:
479+
all_threads.remove(item)
480+
log.debug(f"{self.username} remove {item['title']} from list")
481+
482+
self.ReplyList = [item["url"] for item in all_threads]
483+
log.debug(f"{self.username} get reply list number {str(len(self.ReplyList))}")
501484

502485
#从今日列表中抽取出一个帖子
503486
def get_one_link(self) -> Union[str , None]:

0 commit comments

Comments
 (0)