1010import urllib
1111
1212import bs4
13- import chardet
1413import html2epub
1514import requests
1615
2322 "waitPackage" : "no" ,
2423 "autoDelete" : "yes" ,
2524 "verifyCert" : "yes" ,
26- "threads" : 3
25+ "threads" : 5
2726}
2827
2928
@@ -38,8 +37,10 @@ def find_all(a_str, sub):
3837
3938
def to_str(bytes_or_str):
    """Return *bytes_or_str* as text, decoding bytes as UTF-8 with a GBK fallback.

    Args:
        bytes_or_str: A bytes-like object exposing ``decode`` (``bytes`` or
            ``bytearray``), or an already-decoded ``str``.

    Returns:
        The decoded text. A ``str`` argument is returned unchanged.

    Raises:
        UnicodeDecodeError: If the payload is valid in neither UTF-8 nor GBK.
    """
    # Already text: nothing to decode (str has no .decode in Python 3).
    if isinstance(bytes_or_str, str):
        return bytes_or_str
    try:
        return bytes_or_str.decode(encoding="UTF-8")
    except UnicodeDecodeError:
        # Only a genuine decoding failure triggers the GBK retry; a bare
        # ``except`` would also swallow KeyboardInterrupt and unrelated bugs.
        return bytes_or_str.decode(encoding="GBK")
4445
4546
@@ -126,7 +127,7 @@ def loadConfig():
126127 pass
127128
128129
129- def download (url ,threadname ):
130+ def download (url , threadname ):
130131
131132 uri = urllib .parse .urlparse (url )
132133 params = urllib .parse .parse_qs (uri .query )
@@ -137,7 +138,7 @@ def download(url,threadname):
137138
138139 src = fetch (url )
139140 title = extract_title (src , full = True )
140- print (f'{ threadname } :GOT { title } ' )
141+ print (f'{ threadname } :PROC { title } ' )
141142 # REMOVE BLANKS
142143 raw = str (src )
143144
@@ -183,7 +184,7 @@ def download(url,threadname):
183184 downloading .pop ()
184185 # SKIP DOWNLOADED FILES
185186 if (os .path .exists ("%s-%s.html" % (tid , title ))):
186- print (f"{ threadname } :SKP { tid } -{ title } .html" , file = sys .stderr )
187+ print (f"{ threadname } :SKIP { tid } -{ title } .html" , file = sys .stderr )
187188 return
188189
189190 [s .extract () for s in content_soup ('script' )]
@@ -219,9 +220,9 @@ def download(url,threadname):
219220 file .write (page_content )
220221 file .write (r"</p></body></html>" )
221222 except :
222- print (f"{ threadname } :Error writing { title } " , file = sys .stderr )
223+ print (f"{ threadname } :ERR CAN'T WRITE { title } " , file = sys .stderr )
223224 else :
224- print (f'{ threadname } :IGN { title } ' )
225+ print (f'{ threadname } :PASS { title } ' )
225226 # add to downloaded
226227 downloaded .add (url )
227228
@@ -231,7 +232,7 @@ def __init__(self, name, q):
231232 threading .Thread .__init__ (self )
232233 self .name = name
233234 self .q = q
234- self .daemon = True
235+ self .daemon = True
235236
236237 def run (self ):
237238 while not exitflag :
@@ -242,7 +243,7 @@ def run(self):
242243 threadlock .release ()
243244 if (url ):
244245 downloading .append (url )
245- download (url ,self .name )
246+ download (url , self .name )
246247
247248
248249workqueue = queue .Queue ()
@@ -259,7 +260,7 @@ def run(self):
259260 if (args_length > 1 ):
260261 url = sys .argv [1 ]
261262 if (not url ):
262- url = str (input ("请粘贴cool18站的文章网址:" ))
263+ url = str (input ("# 请粘贴cool18站的文章网址:" ))
263264 loadConfig ()
264265 pypath = sys .argv [0 ]
265266 pydir = os .getcwd ()
0 commit comments