
Commit c3b73d7

Refactor several methods, auto-detect text encoding, improve the text-washing step
1 parent 75022e6 commit c3b73d7

1 file changed: +54 −32


4.cool18-Article2epub/c2epub.py

Lines changed: 54 additions & 32 deletions
@@ -7,6 +7,7 @@
 import urllib
 
 import bs4
+import chardet
 import html2epub
 import requests
 
@@ -22,19 +23,19 @@
 }
 
 
-def to_str(bytes_or_str):
-    if isinstance(bytes_or_str, bytes):
-        value = bytes_or_str.decode('utf-8')
-    else:
-        value = bytes_or_str
-    return value
+def find_all(a_str, sub):
+    start = 0
+    while True:
+        start = a_str.find(sub, start)
+        if start == -1:
+            return
+        yield start
+        start += len(sub)
 
 
-def to_bytes(bytes_or_str):
-    if isinstance(bytes_or_str, str):
-        value = bytes_or_str.encode('utf-8')
-    else:
-        value = bytes_or_str
+def to_str(bytes_or_str):
+    codec = chardet.detect(bytes_or_str)
+    value = bytes_or_str.decode(encoding=codec['encoding'])
     return value
 
 
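Note: the new find_all() is a generator that yields every index at which sub occurs in a_str; for example, list(find_all("aXbXc", "X")) evaluates to [1, 3]. The rewritten to_str() now auto-detects the encoding with chardet, but chardet.detect() can return {'encoding': None} when detection fails (e.g. on empty input) and raises TypeError when handed a str, so the decode call can still fail. A defensive variant, sketched here with UTF-8 assumed as the fallback:

    import chardet

    def to_str(bytes_or_str):
        # Values that are already str pass through unchanged.
        if isinstance(bytes_or_str, str):
            return bytes_or_str
        # chardet guesses the encoding; assume UTF-8 when it cannot.
        codec = chardet.detect(bytes_or_str)
        encoding = codec['encoding'] or 'utf-8'
        return bytes_or_str.decode(encoding, errors='replace')
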
@@ -65,14 +66,26 @@ def fetch(url):
 L_END = '''<a name="postfp">'''
 
 
+def clean_title(_title):
+    title = _title.replace(" - cool18.com", "")
+    title = title.replace("/", "-")
+    title = title.replace("\\", "-")
+    title = title.replace("*", "-")
+    title = title.replace("?", "-")
+    title = title.replace("<", "-")
+    title = title.replace(">", "-")
+    title = title.replace("|", "-")
+    title = title.replace(":", "-").strip()
+    return title
+
+
 def extract_title(content, full=False):
     title_left = content.find('<title>')+len('<title>')
     title_right = content.find('</title>')
     title = content[title_left:title_right]
 
     if (full):
-        title = title.replace(" - cool18.com", "").replace("/",
-                              "-").replace("\\", "-").strip()
+        title = clean_title(title)
     else:
         title_search = re.search('[《](.*?)[》]', title, re.IGNORECASE)
         if title_search:
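Note: clean_title() maps characters that Windows forbids in file names to '-'. A compact equivalent, sketched with one regular expression (the script already imports re):

    import re

    def clean_title(_title):
        # Drop the site suffix, then replace the characters that are
        # illegal in Windows file names (/ \ * ? < > | :) with '-'.
        title = _title.replace(" - cool18.com", "")
        return re.sub(r'[/\\*?<>|:]', '-', title).strip()
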
@@ -82,12 +95,18 @@ def extract_title(content, full=False):
             if title_search:
                 title = title_search.group(1)
             else:
-                title = title.replace(
-                    " - cool18.com", "").replace("/", "-").replace("\\", "-").strip()
-
+                title = clean_title(title)
     return title
 
 
+def should_ignore_this_link(_title):
+    iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
+    for i in iwords:
+        if i in _title:
+            return True
+    return False
+
+
 def loadConfig():
     cf = configparser.ConfigParser()
     try:
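Note: should_ignore_this_link() skips forum housekeeping links (tip/award notices, empty posts, and so on) by substring match; the loop is equivalent to this one-liner sketch:

    def should_ignore_this_link(_title):
        # True when the link text contains any housekeeping marker.
        iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
        return any(w in _title for w in iwords)
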
@@ -117,7 +136,6 @@ def download(url):
     print('+%s' % title)
 
     # REMOVE BLANKS
-
     raw = str(src)
 
     try:
@@ -147,7 +165,7 @@ def download(url):
     comm_soup = bs4.BeautifulSoup(comments, "lxml")
     for a in comm_soup.find_all('a'):
         _title = a.getText()
-        if ('银元奖励' in _title) or ('无内容' in _title) or ('版块基金' in _title) or (' 给 ' in _title) or ('幸运红包' in _title):
+        if should_ignore_this_link(_title):
             continue
         #print('+%s' % _title)
         _u = a.get('href')
@@ -164,15 +182,18 @@ def download(url):
         return
 
     [s.extract() for s in content_soup('script')]
-
+    # Wash Text
     page_content = str(content_soup.find('body').getText())
+    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace("  ", "@@@@@@@@")
     page_content = page_content.replace(os.linesep, "@@@@@@@@")
-    page_content = page_content.replace(
-        'cool18.com', '@@@@@@@@').replace('www.6park.com', '').replace('6park.com', '')
+    page_content = page_content.replace("\n", "@@@@@@@@")
+    page_content = page_content.replace('cool18.com', '@@@@@@@@')
+    page_content = page_content.replace('www.6park.com', '')
+    page_content = page_content.replace('6park.com', '')
     page_content = page_content.replace("@@@@@@@@@@@@@@@@", "</p><p>")
-    page_content = page_content.replace("@@@@@@@@", "")
-    page_content = page_content.replace(" ", "")
-    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace("@@@@@@@@", "</p><p>")
+    page_content = page_content.replace("</p><p></p><p>", "</p><p>")
 
     try:
         last_pos = page_content.rindex('评分完成')
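Note: the reworked wash step first normalizes every break (os.linesep, bare \n, and the embedded 'cool18.com' watermark) to the sentinel @@@@@@@@, then turns sentinels into HTML paragraph breaks and collapses the empty paragraphs that doubled breaks leave behind. A minimal illustration on a hypothetical input:

    raw = "第一段cool18.com\n\n第二段\n尾行"
    washed = raw.replace('cool18.com', '@@@@@@@@')
    washed = washed.replace("\n", "@@@@@@@@")
    # A blank line (two sentinels back to back) becomes one paragraph break,
    washed = washed.replace("@@@@@@@@@@@@@@@@", "</p><p>")
    # any remaining single sentinel also opens a paragraph,
    washed = washed.replace("@@@@@@@@", "</p><p>")
    # and empty paragraphs from runs of breaks are collapsed.
    washed = washed.replace("</p><p></p><p>", "</p><p>")
    print(washed)  # 第一段</p><p>第二段</p><p>尾行
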
@@ -183,8 +204,11 @@ def download(url):
     if (len(page_content.strip()) > int(config['minContent'])):
         try:
             with open("%s-%s.html" % (tid, title), 'w+', encoding='utf-8', errors='ignore') as file:
-                file.write(
-                    r'<?xml version="1.0" encoding="utf-8" standalone="no"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN"><head><title>')
+                file.write("""<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
+<head>
+<title>""")
                 file.write(title)
                 file.write(r"</title></head><body><p>")
                 file.write(page_content)
@@ -197,10 +221,8 @@ def download(url):
 if __name__ == '__main__':
     args_length = len(sys.argv)
     url = None
-
     if (args_length > 1):
         url = sys.argv[1]
-
     if (not url):
         url = str(input("请粘贴cool18站的文章网址:"))
     loadConfig()
@@ -225,14 +247,14 @@ def download(url):
         if (current_url in downloaded):
             pass
         else:
-            print(r"~[%3d]%s" % (len(hive), current_url))
+            print(r"~[%2d]%s" % (len(hive), current_url))
             download(current_url)
             downloaded.add(current_url)
-
+    print(">Download completed.")
     if config['waitPackage'] == 'yes':
-        input('>Press Enter when ready...')
+        input('>Press Enter to pack files into epub...')
 
-    print(">Download completed, now packaging epub...")
+    print(">now packaging epub...")
     epub = html2epub.Epub(title, language="zh-cn",
                           creator="cool18", publisher="cool18")
     for file in os.listdir("."):
