|
12 | 12 |
|
13 | 13 | DEBUG = False |
14 | 14 |
|
15 | | -__verison__ = "0.25.02.25.1" |
| 15 | +__verison__ = "0.25.04.06.1" |
16 | 16 |
|
17 | 17 | def outputLog(projectName): |
18 | 18 | log = logging.getLogger(f"{projectName}") |
@@ -436,68 +436,51 @@ def get_today_list(self): |
436 | 436 | res = requests.get(self.Today , headers = self.Headers, proxies = proxies) |
437 | 437 | content = res.text |
438 | 438 |
|
439 | | - pat_title : str = ('htm_data/\w+/\w+/\w+.html') |
440 | | - pat_moderator : str = "版主:([\s\S]*?)<\/span>" |
441 | | - pat_username : str = "username=(\w+)" |
442 | | - pat_user : str = 'class="bl">(.*)?</a>' |
443 | | - pat_all_title : str = '<h3><a href="/([\s\S]*?)"' |
444 | | - pat_all_content : str = '<h3><a href=".*" target="_blank" id=".*">(.*)<\/a><\/h3>' |
445 | | - moderator : str = re.search(pat_moderator, content).group(0) |
446 | | - username : List = re.findall(pat_username, moderator) |
447 | | - content = res.text[res.text.find('普通主題'):] |
448 | | - all_username : List = re.findall(pat_user, content) |
449 | | - title = re.findall(pat_title , content) |
450 | | - all_title = re.findall(pat_all_title , content) |
451 | | - all_content = re.findall(pat_all_content , content) |
452 | | - |
453 | | - log.debug(f"{self.username} get list number: {str(len(title))}") |
454 | | - |
455 | | - if len(all_title) != len(all_username): |
456 | | - if self.RetryList > 0: |
457 | | - log.debug(f"{self.username} get list number error , retry get list , remaining retry times: %d" % self.RetryList) |
458 | | - self.RetryList -= 1 |
459 | | - sleep_time = random.randint(6,60) |
460 | | - log.debug(f"{self.username} sleep {sleep_time} seconds") |
461 | | - sleep(sleep_time) |
462 | | - self.get_today_list() |
463 | | - return |
464 | | - else: |
465 | | - self.set_invalid() |
466 | | - self.s.close() |
467 | | - return |
| 439 | + # 版主列表 moderator |
| 440 | + moderator : List = re.findall(r"username=(\w+)", re.search(r"版主:([\s\S]*?)<\/span>", content).group(0)) |
| 441 | + |
| 442 | + # 提取出所有普通主題 |
| 443 | + content = re.search(r'<tbody style="table-layout:fixed;" id="tbody">(.*?)</tbody>', content, re.DOTALL).group(0) |
468 | 444 |
|
| 445 | + # 提取出每一个主题, 每个主题以 <tr class="tr3 t_one tac"> ... </tr> 为一组 |
| 446 | + all_threads = [] |
| 447 | + tr_pattern = re.compile(r'<tr class="tr3 t_one tac">(.*?)</tr>',re.DOTALL) |
| 448 | + |
| 449 | + # 遍历每个匹配到的<tr>块 |
| 450 | + for tr_block in tr_pattern.findall(content): |
| 451 | + # 屏蔽置顶帖 |
| 452 | + if "Top-marks" in tr_block:continue |
| 453 | + |
| 454 | + # 提取所有<td>元素 |
| 455 | + td_matches = re.findall(r'<td.*?>(.*?)</td>', tr_block, re.DOTALL) |
| 456 | + |
| 457 | + # 提取 url 和 title(第二个<td>) |
| 458 | + url_title_match = re.search(r'<h3>.*?<a href="/(.*?)".*?>(.*?)</a>.*?</h3>', td_matches[1], re.DOTALL) |
| 459 | + |
| 460 | + url = url_title_match.group(1) |
| 461 | + title = re.sub(r'<.*?>', '', url_title_match.group(2)).strip() # 去除HTML标签 |
| 462 | + |
| 463 | + # 提取 author(第三个<td>) |
| 464 | + author = re.search(r'<a href=".*?" class="bl">(.*?)</a>', td_matches[2], re.DOTALL).group(1).strip() |
| 465 | + |
| 466 | + all_threads.append({ |
| 467 | + 'url': url, |
| 468 | + 'title': title, |
| 469 | + 'author': author |
| 470 | + }) |
| 471 | + |
| 472 | + log.debug(f"{self.username} get list number: {str(len(all_threads))}") |
469 | 473 |
|
470 | 474 | if Forbid: |
471 | | - black_list : List = [] |
472 | | - log.debug("moderator list: " + str(" ".join(username))) |
473 | | - for index in range(len(all_username)): |
474 | | - if all_username[index].strip() in moderator: |
475 | | - black_list.append(all_title[index]) |
476 | | - for item in black_list: |
477 | | - try: |
478 | | - title.remove(item) |
479 | | - log.debug(f"{self.username} remove {item} from list") |
480 | | - except Exception as e: |
481 | | - log.error(f"{self.username} remove {item} from list 失败, 错误类型: {type(e).__name__} 描述: {e}") |
482 | | - |
483 | | - black_list : List = [] |
484 | | - log.debug(f"{self.username} 排除: {self.excludeContent}") |
485 | | - for index in range(len(all_content)): |
486 | | - content = all_content[index] |
487 | | - for item in self.excludeContent: |
488 | | - if item in content: |
489 | | - black_list.append(all_title[index]) |
490 | | - break |
491 | | - |
492 | | - for item in black_list: |
493 | | - try: |
494 | | - title.remove(item) |
495 | | - log.debug(f"{self.username} remove {item} from list") |
496 | | - except Exception as e: |
497 | | - log.error(f"{self.username} remove {item} from list 失败, 错误类型: {type(e).__name__} 描述: {e}") |
498 | | - |
499 | | - self.ReplyList = title |
500 | | - log.debug(f"{self.username} get reply list number {str(len(title))}") |
| 475 | + log.debug("moderator list: " + str(" ".join(moderator))) |
| 476 | + do_not_reply = [thread for thread in all_threads if (thread["author"] in moderator) or (thread["title"] in self.excludeContent)] |
| 477 | + |
| 478 | + for item in do_not_reply: |
| 479 | + all_threads.remove(item) |
| 480 | + log.debug(f"{self.username} remove {item['title']} from list") |
| 481 | + |
| 482 | + self.ReplyList = [item["url"] for item in all_threads] |
| 483 | + log.debug(f"{self.username} get reply list number {str(len(self.ReplyList))}") |
501 | 484 |
|
502 | 485 | #从今日列表中抽取出一个帖子 |
503 | 486 | def get_one_link(self) -> Union[str , None]: |
|
0 commit comments