 import urllib

 import bs4
+import chardet
 import html2epub
 import requests

 }


-def to_str(bytes_or_str):
-    if isinstance(bytes_or_str, bytes):
-        value = bytes_or_str.decode('utf-8')
-    else:
-        value = bytes_or_str
-    return value
+def find_all(a_str, sub):
+    start = 0
+    while True:
+        start = a_str.find(sub, start)
+        if start == -1:
+            return
+        yield start
+        start += len(sub)


-def to_bytes(bytes_or_str):
-    if isinstance(bytes_or_str, str):
-        value = bytes_or_str.encode('utf-8')
-    else:
-        value = bytes_or_str
+def to_str(bytes_or_str):
+    codec = chardet.detect(bytes_or_str)
+    value = bytes_or_str.decode(encoding=codec['encoding'])
     return value


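Two notes on this hunk. The new to_str no longer hard-codes UTF-8; it sniffs the encoding with chardet.detect, which matters because the 6park/cool18 family of sites has historically served GBK/GB2312 rather than UTF-8. The new find_all is a generator yielding the index of every non-overlapping occurrence of a substring. A minimal sketch with made-up sample data (the GBK payload here is hypothetical):

    import chardet

    # A GBK payload like this would mis-decode or raise UnicodeDecodeError
    # under the old hard-coded .decode('utf-8'):
    raw = "你好，世界".encode("gbk")
    codec = chardet.detect(raw)              # e.g. {'encoding': 'GB2312', ...}
    text = raw.decode(encoding=codec['encoding'])

    # find_all yields every non-overlapping match position:
    print(list(find_all("abcabcabc", "abc")))    # [0, 3, 6]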
@@ -65,14 +66,26 @@ def fetch(url):
 L_END = '''<a name="postfp">'''


+def clean_title(_title):
+    title = _title.replace(" - cool18.com", "")
+    title = title.replace("/", "-")
+    title = title.replace("\\", "-")
+    title = title.replace("*", "-")
+    title = title.replace("?", "-")
+    title = title.replace("<", "-")
+    title = title.replace(">", "-")
+    title = title.replace("|", "-")
+    title = title.replace(":", "-").strip()
+    return title
+
+
 def extract_title(content, full=False):
     title_left = content.find('<title>')+len('<title>')
     title_right = content.find('</title>')
     title = content[title_left:title_right]

     if (full):
-        title = title.replace(" - cool18.com", "").replace("/",
-                              "-").replace("\\", "-").strip()
+        title = clean_title(title)
     else:
         title_search = re.search('[《](.*?)[》]', title, re.IGNORECASE)
         if title_search:
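clean_title folds the two previously duplicated replace chains into one helper, and additionally strips the remaining characters Windows forbids in filenames (* ? < > | :), since the title later becomes part of the output .html filename. A quick trace with an invented title:

    clean_title('《某书》 第1章: 上/下? - cool18.com')
    # -> '《某书》 第1章- 上-下-'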
@@ -82,12 +95,18 @@ def extract_title(content, full=False):
         if title_search:
             title = title_search.group(1)
         else:
-            title = title.replace(
-                " - cool18.com", "").replace("/", "-").replace("\\", "-").strip()
-
+            title = clean_title(title)
     return title


+def should_ignore_this_link(_title):
+    iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
+    for i in iwords:
+        if i in _title:
+            return True
+    return False
+
+
 def loadConfig():
     cf = configparser.ConfigParser()
     try:
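should_ignore_this_link centralizes the markers that identify housekeeping replies rather than story chapters; the ignore words translate roughly to "silver-dollar reward", "no content", "board fund", " gave ", and "lucky red packet". Invented sample titles:

    should_ignore_this_link('银元奖励到账')     # True  (tip/reward notice)
    should_ignore_this_link('第三章 (无内容)')  # True  (empty "no content" reply)
    should_ignore_this_link('《某书》第三章')   # False (a real chapter link)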
@@ -117,7 +136,6 @@ def download(url):
     print('+%s' % title)

     # REMOVE BLANKS
-
     raw = str(src)

     try:
@@ -147,7 +165,7 @@ def download(url):
     comm_soup = bs4.BeautifulSoup(comments, "lxml")
     for a in comm_soup.find_all('a'):
         _title = a.getText()
-        if ('银元奖励' in _title) or ('无内容' in _title) or ('版块基金' in _title) or (' 给 ' in _title) or ('幸运红包' in _title):
+        if should_ignore_this_link(_title):
             continue
         #print('+%s' % _title)
         _u = a.get('href')
@@ -164,15 +182,18 @@ def download(url):
         return

     [s.extract() for s in content_soup('script')]
-
+    # Wash Text
     page_content = str(content_soup.find('body').getText())
+    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace(" ", "@@@@@@@@")
     page_content = page_content.replace(os.linesep, "@@@@@@@@")
-    page_content = page_content.replace(
-        'cool18.com', '@@@@@@@@').replace('www.6park.com', '').replace('6park.com', '')
+    page_content = page_content.replace("\n", "@@@@@@@@")
+    page_content = page_content.replace('cool18.com', '@@@@@@@@')
+    page_content = page_content.replace('www.6park.com', '')
+    page_content = page_content.replace('6park.com', '')
     page_content = page_content.replace("@@@@@@@@@@@@@@@@", "</p><p>")
-    page_content = page_content.replace("@@@@@@@@", "")
-    page_content = page_content.replace(" ", "")
-    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace("@@@@@@@@", "</p><p>")
+    page_content = page_content.replace("</p><p></p><p>", "</p><p>")

     try:
         last_pos = page_content.rindex('评分完成')
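The rewritten washing block uses "@@@@@@@@" as a line-break sentinel: os.linesep, bare "\n", and the "cool18.com" watermark the forum injects into post bodies all become the sentinel. A doubled sentinel (16 @'s, i.e. a blank line) becomes a paragraph break, and the behavior change is that a single sentinel now also opens a new paragraph instead of being deleted, with the final replace collapsing the empty paragraphs that can produce. A worked trace, assuming os.linesep == "\n" and invented text:

    s = "第一段cool18.com\n\n第二段"
    s = s.replace("\n", "@@@@@@@@")               # 第一段cool18.com + 16 @'s + 第二段
    s = s.replace("cool18.com", "@@@@@@@@")       # 第一段 + 24 @'s + 第二段
    s = s.replace("@@@@@@@@@@@@@@@@", "</p><p>")  # 第一段</p><p> + 8 @'s + 第二段
    s = s.replace("@@@@@@@@", "</p><p>")          # 第一段</p><p></p><p>第二段
    s = s.replace("</p><p></p><p>", "</p><p>")    # 第一段</p><p>第二段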
@@ -183,8 +204,11 @@ def download(url):
     if (len(page_content.strip()) > int(config['minContent'])):
         try:
             with open("%s-%s.html" % (tid, title), 'w+', encoding='utf-8', errors='ignore') as file:
-                file.write(
-                    r'<?xml version="1.0" encoding="utf-8" standalone="no"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN"><head><title>')
+                file.write("""<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
+<head>
+<title>""")
                 file.write(title)
                 file.write(r"</title></head><body><p>")
                 file.write(page_content)
@@ -197,10 +221,8 @@ def download(url):
 if __name__ == '__main__':
     args_length = len(sys.argv)
     url = None
-
     if (args_length > 1):
         url = sys.argv[1]
-
     if (not url):
         url = str(input("请粘贴cool18站的文章网址:"))
     loadConfig()
@@ -225,14 +247,14 @@ def download(url):
         if (current_url in downloaded):
             pass
         else:
-            print(r"~[%3d]%s" % (len(hive), current_url))
+            print(r"~[%2d]%s" % (len(hive), current_url))
             download(current_url)
             downloaded.add(current_url)
-
+    print(">Download completed.")
     if config['waitPackage'] == 'yes':
-        input('>Press Enter when ready ...')
+        input('>Press Enter to pack files into epub ...')

-    print(">Download completed, now packaging epub...")
+    print(">now packaging epub...")
     epub = html2epub.Epub(title, language="zh-cn",
                           creator="cool18", publisher="cool18")
     for file in os.listdir("."):
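The diff cuts off inside the packaging loop, so the loop body shown below is only an illustration of typical html2epub usage, not the commit's actual code; create_chapter_from_file, add_chapter, and create_epub are html2epub's pypub-style helpers, and the .html filename filter is an assumption:

    for file in os.listdir("."):
        if file.endswith(".html"):                        # assumed filter
            chapter = html2epub.create_chapter_from_file(file)
            epub.add_chapter(chapter)
    epub.create_epub(os.getcwd())                         # writes <title>.epub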