@@ -74,12 +74,16 @@ def extract_title(content, full=False):
7474 title = title .replace (" - cool18.com" , "" ).replace ("/" ,
7575 "-" ).replace ("\\ " , "-" ).strip ()
7676 else :
77- title_search = re .search ('[【 《](.*)[】 》]' , title , re .IGNORECASE )
77+ title_search = re .search ('[《](.*?)[ 》]' , title , re .IGNORECASE )
7878 if title_search :
7979 title = title_search .group (1 )
8080 else :
81- title = title .replace (
82- " - cool18.com" , "" ).replace ("/" , "-" ).replace ("\\ " , "-" ).strip ()
81+ title_search = re .search ('[【](.*?)[】]' , title , re .IGNORECASE )
82+ if title_search :
83+ title = title_search .group (1 )
84+ else :
85+ title = title .replace (
86+ " - cool18.com" , "" ).replace ("/" , "-" ).replace ("\\ " , "-" ).strip ()
8387
8488 return title
8589
@@ -162,9 +166,14 @@ def download(url):
162166 [s .extract () for s in content_soup ('script' )]
163167
164168 page_content = str (content_soup .find ('body' ).getText ())
165- page_content = page_content .replace (" \n " , "" )
169+ page_content = page_content .replace (os . linesep , "@@@@@@@@ " )
166170 page_content = page_content .replace (
167- 'cool18.com' , '\n ' ).replace ('www.6park.com' , '' ).replace ('6park.com' , '' ).replace ("\n " , "</p><p>" ).replace ("<p></p>" , "" )
171+ 'cool18.com' , '@@@@@@@@' ).replace ('www.6park.com' , '' ).replace ('6park.com' , '' )
172+ page_content = page_content .replace ("@@@@@@@@@@@@@@@@" , "</p><p>" )
173+ page_content = page_content .replace ("@@@@@@@@" , "" )
174+ page_content = page_content .replace (" " , "" )
175+ page_content = page_content .replace (" " , "" )
176+
168177 try :
169178 last_pos = page_content .rindex ('评分完成' )
170179 page_content = page_content [:last_pos ]
0 commit comments