
Commit 07541b8

multithread
1 parent c3b73d7 commit 07541b8

File tree

3 files changed: 355 additions, 0 deletions

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# Download cool18.com article and package to epub (multithread version)

### Project files

```
c2epub.py --> the main script
config.ini --> config file
```

### How to use

1. clone or download this repo
2. navigate to this folder
3. ```pip3 install bs4 lxml requests pysocks html2epub chardet```
4. edit config.ini to set the proxy and other options
5. ```./c2epub.py "article url here"``` (see the example run below)
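
For example, a typical run looks like this (the URL is a placeholder; use the address of the article's first post):

```
pip3 install bs4 lxml requests pysocks html2epub chardet
./c2epub.py "http://www.cool18.com/bbs4/index.php?app=forum&act=threadview&tid=<article id>"
```

The chapters are first saved as HTML files in a temporary folder named after the article title, then packaged into an epub in the directory you started the script from; with `autoDelete = yes` in config.ini the temporary folder is removed afterwards.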

### NSFW WARNING

This repo is used to download and package some adult novels; if you are under 18, please leave.

### RESTRICTIONS

Old posts can't be processed, for example: http://www.cool18.com/bbs4/index.php?app=forum&act=threadview&tid=13864396
Lines changed: 314 additions & 0 deletions
@@ -0,0 +1,314 @@
#!/usr/bin/python3
import configparser
import os
import queue
import re
import shutil
import sys
import threading
import time
import urllib.parse

import bs4
import chardet
import html2epub
import requests

# requires: requests bs4 lxml pysocks html2epub chardet

# defaults, overridden by config.ini in loadConfig()
config = {
    "enableProxy": "no",
    "proxy": "socks5://127.0.0.1:1081",
    "minContent": 1000,
    "waitPackage": "no",
    "autoDelete": "yes",
    "verifyCert": "yes",
    "threads": 3
}


def find_all(a_str, sub):
    # yield every index where sub occurs in a_str (currently unused helper)
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)


def to_str(bytes_or_str):
    # decode raw bytes using the charset detected by chardet
    codec = chardet.detect(bytes_or_str)
    value = bytes_or_str.decode(encoding=codec['encoding'])
    return value


def fetch(url):
    # download a page, optionally through the configured proxy, and return it as text
    if config['enableProxy'] == 'yes':
        proxy = config['proxy']
        proxies = dict(http=proxy, https=proxy)
        try:
            resp = requests.get(url, proxies=proxies, verify=(
                config['verifyCert'] == 'yes'))
            src = to_str(resp.content)
            return src
        except Exception:
            # fall back to an empty page on errors, matching the non-proxy branch
            return ""
    else:
        try:
            resp = requests.get(url)
            src = to_str(resp.content)
            return src
        except Exception:
            return ""


P_START = "<!--bodybegin-->"
P_END = "<!--bodyend-->"
L_START = '''<a name="followups" style=''>'''
L_END = '''<a name="postfp">'''


def clean_title(_title):
    title = _title.replace(" - cool18.com", "")
    title = title.replace("/", "-")
    title = title.replace("\\", "-")
    title = title.replace("*", "-")
    title = title.replace("?", "-")
    title = title.replace("<", "-")
    title = title.replace(">", "-")
    title = title.replace("|", "-")
    title = title.replace(":", "-").strip()
    return title


def extract_title(content, full=False):
    title_left = content.find('<title>')+len('<title>')
    title_right = content.find('</title>')
    title = content[title_left:title_right]

    if (full):
        title = clean_title(title)
    else:
        title_search = re.search('[《](.*?)[》]', title, re.IGNORECASE)
        if title_search:
            title = title_search.group(1)
        else:
            title_search = re.search('[【](.*?)[】]', title, re.IGNORECASE)
            if title_search:
                title = title_search.group(1)
            else:
                title = clean_title(title)
    return title


def should_ignore_this_link(_title):
    # link titles that are score/reward notices rather than chapters
    iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
    for i in iwords:
        if i in _title:
            return True
    return False


def loadConfig():
    cf = configparser.ConfigParser()
    try:
        cf.read('config.ini')
        config['enableProxy'] = cf.get('network', 'enableProxy')
        config['proxy'] = cf.get('network', 'proxy')
        config['minContent'] = cf.get('config', 'minContent')
        config['waitPackage'] = cf.get('config', 'waitPackage')
        config['autoDelete'] = cf.get('config', 'autoDelete')
        config['verifyCert'] = cf.get('network', 'verifyCert')
        config['threads'] = cf.get('config', 'threads')
        requests.packages.urllib3.disable_warnings()
    except Exception:
        pass


def download(url, threadname):

    uri = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(uri.query)

    # parse_qs returns lists; take the first tid value (it is part of the output filename)
    tid = params.get('tid', [None])[0]
    if not tid:
        return

    src = fetch(url)
    title = extract_title(src, full=True)
    print(f'{threadname}:GOT {title}')
    # REMOVE BLANKS
    raw = str(src)

    try:
        pos_start = raw.find(P_START)+len(P_START)
        pos_end = raw.find(P_END)
        page_content = raw[pos_start:pos_end]
        content_soup = bs4.BeautifulSoup(page_content, "lxml")
        # extract in page chapters
        links = content_soup.find_all('a')
        for a in links:
            _title = a.getText()
            #print('+%s' % _title)
            _url = a.get('href')
            if (_url and len(_url.strip()) > 8):
                if config['host'] in _url and not (_url in downloaded):
                    workqueue.put(_url)
            a.extract()
    except ValueError:
        return

    try:
        # extract below links
        lpos_start = raw.find(L_START)+len(L_START)
        lpos_end = raw.find(L_END)
        comments = raw[lpos_start:lpos_end]
        comm_soup = bs4.BeautifulSoup(comments, "lxml")
        for a in comm_soup.find_all('a'):
            _title = a.getText()
            if should_ignore_this_link(_title):
                continue
            #print('+%s' % _title)
            _u = a.get('href')
            if (_u and _u.startswith("http")):
                if config['host'] in _u and not (_u in downloaded):
                    workqueue.put(_u)
            else:
                _u = config['host'] + _u
                if config['host'] in _u and not (_u in downloaded):
                    workqueue.put(_u)
    except ValueError:
        pass
    # the "downloading" bookkeeping is handled in fetcher.run(), so the main
    # loop keeps waiting until this page has been fully processed
    # SKIP DOWNLOADED FILES
    if (os.path.exists("%s-%s.html" % (tid, title))):
        print(f"{threadname}:SKP {tid}-{title}.html", file=sys.stderr)
        return

    [s.extract() for s in content_soup('script')]
    # Wash Text
    page_content = str(content_soup.find('body').getText())
    page_content = page_content.replace(" ", "")
    page_content = page_content.replace("  ", "@@@@@@@@")
    page_content = page_content.replace(os.linesep, "@@@@@@@@")
    page_content = page_content.replace("\n", "@@@@@@@@")
    page_content = page_content.replace('cool18.com', '@@@@@@@@')
    page_content = page_content.replace('www.6park.com', '')
    page_content = page_content.replace('6park.com', '')
    page_content = page_content.replace("@@@@@@@@@@@@@@@@", "</p><p>")
    page_content = page_content.replace("@@@@@@@@", "</p><p>")
    page_content = page_content.replace("</p><p></p><p>", "</p><p>")

    try:
        # drop everything after the "rating done" marker (评分完成)
        last_pos = page_content.rindex('评分完成')
        page_content = page_content[:last_pos]
    except ValueError:
        pass

    if (len(page_content.strip()) > int(config['minContent'])):
        try:
            with open("%s-%s.html" % (tid, title), 'w+', encoding='utf-8', errors='ignore') as file:
                file.write("""<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
<head>
<title>""")
                file.write(title)
                file.write(r"</title></head><body><p>")
                file.write(page_content)
                file.write(r"</p></body></html>")
        except Exception:
            print(f"{threadname}:Error writing {title}", file=sys.stderr)
    else:
        print(f'{threadname}:IGN {title}')
    # add to downloaded
    downloaded.add(url)


class fetcher(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q
        self.daemon = True

    def run(self):
        while not exitflag:
            url = None
            threadlock.acquire()
            if (not workqueue.empty()):
                url = workqueue.get()
            threadlock.release()
            if (url):
                downloading.append(url)
                try:
                    download(url, self.name)
                except Exception as e:
                    print(f"{self.name}:ERR {url}: {e}", file=sys.stderr)
                finally:
                    # remove the url from the bookkeeping list even if download() failed,
                    # otherwise the main loop would wait forever
                    if url in downloading:
                        downloading.remove(url)


workqueue = queue.Queue()
threads = []
downloaded = set()  # urls that have already been processed
threadlock = threading.Lock()
exitflag = 0
downloading = []  # urls currently being processed by a worker

# Main Logic
if __name__ == '__main__':
    args_length = len(sys.argv)
    url = None
    if (args_length > 1):
        url = sys.argv[1]
    if (not url):
        # prompt: "Please paste the URL of a cool18 article"
        url = str(input("请粘贴cool18站的文章网址:"))
    loadConfig()
    pypath = sys.argv[0]
    pydir = os.getcwd()

    config['host'] = url[:url.rindex('/')+1]

    src = fetch(url)
    title = extract_title(src)

    if not os.path.exists(title):
        os.mkdir(title)
    os.chdir(title)
    exitflag = 0

    tid = 0
    # Init Q
    workqueue.put(url)

    t = fetcher(f"T{tid}", workqueue)
    tid += 1
    threads.append(t)
    t.start()

    while downloading or not workqueue.empty():
        time.sleep(0.1)
        # spawn additional worker threads on demand, up to config['threads']
        if len(threads) < int(config['threads']):
            t = fetcher(f"T{tid}", workqueue)
            tid += 1
            threads.append(t)
            t.start()

    # Queue is empty, exit.
    exitflag = 1

    print(">Download completed.")
    if config['waitPackage'] == 'yes':
        input('>Press Enter to pack files into epub...')

    print(">now packaging epub...")
    epub = html2epub.Epub(title, language="zh-cn",
                          creator="cool18", publisher="cool18")
    for file in os.listdir("."):
        chap = html2epub.create_chapter_from_file(file)
        epub.add_chapter(chap)
    epubpath = epub.create_epub(pydir)
    print(">OK, epub generated at: %s" % epubpath)

    if config['autoDelete'] == 'yes':
        os.chdir("..")
        print(">Deleting Directory: %s" % title)
        shutil.rmtree(title)
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
[network]
#enableProxy: set to no to connect to cool18.com directly
enableProxy = no
#proxy address
proxy = http://127.0.0.1:8087
#verify SSL cert
verifyCert = yes

[config]
#minimum content size; pages shorter than this are ignored
minContent = 1000
#wait before packaging to epub [yes|no]
waitPackage = no
#delete temp folder after the epub is packaged
autoDelete = yes
#download thread count
threads = 10

0 commit comments
