-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathQJZCrawler.py
More file actions
109 lines (95 loc) · 4.3 KB
/
QJZCrawler.py
File metadata and controls
109 lines (95 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 09:53:22 2020
@author: KakaHiguain@BDWM
"""
import os
import re
from bs4 import BeautifulSoup
class QJZCrawler:
    """Collect the WMQJZ board posts for one date into a seed text file.

    A post is picked up when its title starts with ``<section>@<date>``,
    where ``<section>`` is one of ``0-9, A, B, C, D, F, G, H`` and ``<date>``
    is ``month + day`` as passed to the constructor. Only the first post found
    for each section is used; sections ``D`` and ``F`` are skipped.

    Args:
        bdwm: Client object used to talk to the board. It must provide
            ``get_board_page(board, page, mode=...)``,
            ``get_single_post_page(board, postid)`` and
            ``operate_post(board, postid_list, action)``.
        year: Year part of the seed file name.
        month: Month part of the date. It is concatenated with ``day`` using
            ``+`` (presumably both are zero-padded strings such as ``'04'``
            and ``'01'`` -- confirm against the caller).
        day: Day part of the date.
    """

    _WMQJZ_BOARD_NAME = 'WMQJZ'
    _MAX_PAGES_TO_SEARCH = 6
    # Sections that are recognised in titles but not collected.
    _UNSUPPORTED_SECTIONS = frozenset({"D", "F"})

    def __init__(self, bdwm, year, month, day):
        self._bdwm = bdwm
        self._date = month + day
        self._seed_filename = '{}{}.txt'.format(year, self._date)
        # Posts are appended to the seed file one by one, so a file left over
        # from an earlier run has to be removed first.
        try:
            os.remove(self._seed_filename)
        except FileNotFoundError:
            pass

    def _compile_title_pattern(self):
        """Return the compiled regex that recognises post titles for the date.

        Group 1 of a match is the section code. The date is escaped so that
        it is always matched literally.
        """
        # 0-9, A, B, C, D, F, G, H
        return re.compile(r'^([0-9A-DF-H])@' + re.escape(self._date))

    @classmethod
    def _extract_from_post_page(cls, post_page):
        """Return username and post content lines from post_page html.

        Args:
            post_page: HTML text of a single post page.

        Returns:
            A ``(username, post_lines)`` tuple, where ``post_lines`` holds one
            string per ``<p>`` element of the post body, with non-breaking
            spaces replaced by regular spaces.
        """
        post_soup = BeautifulSoup(post_page, features="html.parser")
        owner_body = post_soup.find('div', attrs={'class': 'post-owner'})
        username_body = owner_body.find('p', attrs={'class': 'username'})
        content_body = post_soup.find(
            'div', attrs={'class': 'body file-read image-click-view'})
        # ``.string`` is None (or empty) when a paragraph is not a single
        # non-empty text node; such paragraphs are emitted as blank lines.
        post_lines = [
            paragraph.string.replace('\xa0', ' ') if paragraph.string else ''
            for paragraph in content_body.find_all('p')
        ]
        return username_body.a.string, post_lines

    @classmethod
    def _output_into_file(cls, f, username, post_title, post_lines):
        """Output post info to seed file.

        Writes a three-line header, a blank line, the post lines and a
        closing ``--`` line to ``f``. Lines starting with ``#`` or ``//`` are
        dropped, and runs of blank lines are collapsed to a single one.

        Args:
            f: Writable text file object.
            username: Author name written into the header.
            post_title: Post title written into the header.
            post_lines: Iterable of content lines (without line endings).
        """
        lines = [
            "发信人: {}".format(username),
            "标 题: {}".format(post_title),
            "发信站:",
            "",
        ]
        lines.extend(post_lines)
        lines.append('--')

        previous = ''
        for line in lines:
            # Skip "comment" lines; they do not count as the previous line.
            if line.startswith(('#', '//')):
                continue
            # Avoid writing two consecutive blank lines.
            if previous or line:
                # Need to use \r\n for windows linefeed.
                f.write(line + '\r\n')
            previous = line

    def generate_seed_file(self):
        """Generate seed file according to the posts on WMQJZ.

        Scans up to ``_MAX_PAGES_TO_SEARCH`` board pages, appends every
        accepted post to the seed file, and finally asks the client to apply
        the ``'digest'`` operation to the collected posts.

        Returns:
            True when a post was found for each of the sections 0-9. False
            otherwise; in that case the ``'digest'`` operation is not
            requested, but the seed file may already contain the posts that
            were found.
        """
        title_pattern = self._compile_title_pattern()
        section_set = set()
        postid_list = []
        for page in range(1, self._MAX_PAGES_TO_SEARCH + 1):
            board_page = self._bdwm.get_board_page(
                self._WMQJZ_BOARD_NAME, page, mode='single')
            board_soup = BeautifulSoup(board_page, features="html.parser")
            posts = board_soup.find_all(
                'div', attrs={"class": "list-item-single list-item"})
            for post in posts:
                postid = post['data-itemid']
                # Entries with a negative item id are ignored (presumably
                # they are not regular posts -- confirm against the site).
                if int(postid) < 0:
                    continue
                title_body = post.find('div', attrs={"class": "title l limit"})
                post_title = title_body.string.replace('\xa0', ' ')
                match_res = title_pattern.search(post_title)
                if not match_res:
                    continue
                section = match_res.group(1)
                print("找到{},帖子id:{}".format(post_title, postid))
                if section in self._UNSUPPORTED_SECTIONS:
                    print("暂不支持采{}区,跳过该帖子~".format(section))
                    continue
                # Only the first post found for a section is used.
                if section in section_set:
                    print("{}区已经存在,跳过该帖子~".format(section))
                    continue
                section_set.add(section)
                postid_list.append(postid)

                post_page = self._bdwm.get_single_post_page(
                    self._WMQJZ_BOARD_NAME, postid)
                username, post_lines = QJZCrawler._extract_from_post_page(post_page)
                print("{}区采编:{}".format(section, username))
                # newline="" keeps the explicit \r\n endings untouched.
                # NOTE(review): no encoding is given, so the platform default
                # is used -- confirm what the consumer of the file expects.
                with open(self._seed_filename, "a", newline="") as f:
                    QJZCrawler._output_into_file(f, username, post_title, post_lines)

        # Only the numeric sections 0-9 are required for success.
        missing_sections = [
            section_num for section_num in range(10)
            if str(section_num) not in section_set
        ]
        if missing_sections:
            print("以下分区未能找到帖子:{}".format(str(missing_sections)))
            return False

        print("已生成种子文件{},请查看~".format(self._seed_filename))
        self._bdwm.operate_post(self._WMQJZ_BOARD_NAME, postid_list, 'digest')
        print("已将找到的各区帖子g上。")
        return True