
Commit c3b73d7

Refactor several methods, auto-detect text encoding, improve the text-washing step
1 parent 75022e6 commit c3b73d7

1 file changed: +54 −32


4.cool18-Article2epub/c2epub.py

Lines changed: 54 additions & 32 deletions
@@ -7,6 +7,7 @@
 import urllib
 
 import bs4
+import chardet
 import html2epub
 import requests
 
@@ -22,19 +23,19 @@
 }
 
 
-def to_str(bytes_or_str):
-    if isinstance(bytes_or_str, bytes):
-        value = bytes_or_str.decode('utf-8')
-    else:
-        value = bytes_or_str
-    return value
+def find_all(a_str, sub):
+    start = 0
+    while True:
+        start = a_str.find(sub, start)
+        if start == -1:
+            return
+        yield start
+        start += len(sub)
 
 
-def to_bytes(bytes_or_str):
-    if isinstance(bytes_or_str, str):
-        value = bytes_or_str.encode('utf-8')
-    else:
-        value = bytes_or_str
+def to_str(bytes_or_str):
+    codec = chardet.detect(bytes_or_str)
+    value = bytes_or_str.decode(encoding=codec['encoding'])
     return value
 
 
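Note: the new find_all() is a generator that yields every index at which sub occurs in a_str; for example, list(find_all("aXbXc", "X")) evaluates to [1, 3]. The rewritten to_str() now auto-detects the encoding with chardet, but chardet.detect() can return {'encoding': None} when detection fails (e.g. on empty input) and raises TypeError when handed a str, so the decode call can still fail. A defensive variant, sketched here with UTF-8 assumed as the fallback:

    import chardet

    def to_str(bytes_or_str):
        # Values that are already str pass through unchanged.
        if isinstance(bytes_or_str, str):
            return bytes_or_str
        # chardet guesses the encoding; assume UTF-8 when it cannot.
        codec = chardet.detect(bytes_or_str)
        encoding = codec['encoding'] or 'utf-8'
        return bytes_or_str.decode(encoding, errors='replace')
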
@@ -65,14 +66,26 @@ def fetch(url):
 L_END = '''<a name="postfp">'''
 
 
+def clean_title(_title):
+    title = _title.replace(" - cool18.com", "")
+    title = title.replace("/", "-")
+    title = title.replace("\\", "-")
+    title = title.replace("*", "-")
+    title = title.replace("?", "-")
+    title = title.replace("<", "-")
+    title = title.replace(">", "-")
+    title = title.replace("|", "-")
+    title = title.replace(":", "-").strip()
+    return title
+
+
 def extract_title(content, full=False):
     title_left = content.find('<title>')+len('<title>')
     title_right = content.find('</title>')
     title = content[title_left:title_right]
 
     if (full):
-        title = title.replace(" - cool18.com", "").replace("/",
-                              "-").replace("\\", "-").strip()
+        title = clean_title(title)
     else:
         title_search = re.search('[《](.*?)[》]', title, re.IGNORECASE)
         if title_search:
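Note: clean_title() maps characters that Windows forbids in file names to '-'. A compact equivalent, sketched with one regular expression (the script already imports re):

    import re

    def clean_title(_title):
        # Drop the site suffix, then replace the characters that are
        # illegal in Windows file names (/ \ * ? < > | :) with '-'.
        title = _title.replace(" - cool18.com", "")
        return re.sub(r'[/\\*?<>|:]', '-', title).strip()
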
@@ -82,12 +95,18 @@ def extract_title(content, full=False):
             if title_search:
                 title = title_search.group(1)
             else:
-                title = title.replace(
-                    " - cool18.com", "").replace("/", "-").replace("\\", "-").strip()
-
+                title = clean_title(title)
     return title
 
 
+def should_ignore_this_link(_title):
+    iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
+    for i in iwords:
+        if i in _title:
+            return True
+    return False
+
+
 def loadConfig():
     cf = configparser.ConfigParser()
     try:
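Note: should_ignore_this_link() skips forum housekeeping links (tip/award notices, empty posts, and so on) by substring match; the loop is equivalent to this one-liner sketch:

    def should_ignore_this_link(_title):
        # True when the link text contains any housekeeping marker.
        iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
        return any(w in _title for w in iwords)
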
@@ -117,7 +136,6 @@ def download(url):
     print('+%s' % title)
 
     # REMOVE BLANKS
-
     raw = str(src)
 
     try:
@@ -147,7 +165,7 @@ def download(url):
     comm_soup = bs4.BeautifulSoup(comments, "lxml")
     for a in comm_soup.find_all('a'):
         _title = a.getText()
-        if ('银元奖励' in _title) or ('无内容' in _title) or ('版块基金' in _title) or (' 给 ' in _title) or ('幸运红包' in _title):
+        if should_ignore_this_link(_title):
             continue
         #print('+%s' % _title)
         _u = a.get('href')
@@ -164,15 +182,18 @@ def download(url):
         return
 
     [s.extract() for s in content_soup('script')]
-
+    # Wash Text
     page_content = str(content_soup.find('body').getText())
+    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace("  ", "@@@@@@@@")
     page_content = page_content.replace(os.linesep, "@@@@@@@@")
-    page_content = page_content.replace(
-        'cool18.com', '@@@@@@@@').replace('www.6park.com', '').replace('6park.com', '')
+    page_content = page_content.replace("\n", "@@@@@@@@")
+    page_content = page_content.replace('cool18.com', '@@@@@@@@')
+    page_content = page_content.replace('www.6park.com', '')
+    page_content = page_content.replace('6park.com', '')
     page_content = page_content.replace("@@@@@@@@@@@@@@@@", "</p><p>")
-    page_content = page_content.replace("@@@@@@@@", "")
-    page_content = page_content.replace(" ", "")
-    page_content = page_content.replace(" ", "")
+    page_content = page_content.replace("@@@@@@@@", "</p><p>")
+    page_content = page_content.replace("</p><p></p><p>", "</p><p>")
 
     try:
         last_pos = page_content.rindex('评分完成')
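Note: the reworked wash step first normalizes every break (os.linesep, bare \n, and the embedded 'cool18.com' watermark) to the sentinel @@@@@@@@, then turns sentinels into HTML paragraph breaks and collapses the empty paragraphs that doubled breaks leave behind. A minimal illustration on a hypothetical input:

    raw = "第一段cool18.com\n\n第二段\n尾行"
    washed = raw.replace('cool18.com', '@@@@@@@@')
    washed = washed.replace("\n", "@@@@@@@@")
    # A blank line (two sentinels back to back) becomes one paragraph break,
    washed = washed.replace("@@@@@@@@@@@@@@@@", "</p><p>")
    # any remaining single sentinel also opens a paragraph,
    washed = washed.replace("@@@@@@@@", "</p><p>")
    # and empty paragraphs from runs of breaks are collapsed.
    washed = washed.replace("</p><p></p><p>", "</p><p>")
    print(washed)  # 第一段</p><p>第二段</p><p>尾行
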
@@ -183,8 +204,11 @@ def download(url):
     if (len(page_content.strip()) > int(config['minContent'])):
         try:
             with open("%s-%s.html" % (tid, title), 'w+', encoding='utf-8', errors='ignore') as file:
-                file.write(
-                    r'<?xml version="1.0" encoding="utf-8" standalone="no"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN"><head><title>')
+                file.write("""<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
+<head>
+<title>""")
                 file.write(title)
                 file.write(r"</title></head><body><p>")
                 file.write(page_content)
@@ -197,10 +221,8 @@ def download(url):
 if __name__ == '__main__':
     args_length = len(sys.argv)
     url = None
-
     if (args_length > 1):
         url = sys.argv[1]
-
     if (not url):
         url = str(input("请粘贴cool18站的文章网址:"))
     loadConfig()
@@ -225,14 +247,14 @@ def download(url):
         if (current_url in downloaded):
             pass
         else:
-            print(r"~[%3d]%s" % (len(hive), current_url))
+            print(r"~[%2d]%s" % (len(hive), current_url))
             download(current_url)
             downloaded.add(current_url)
-
+    print(">Download completed.")
     if config['waitPackage'] == 'yes':
-        input('>Press Enter when ready...')
+        input('>Press Enter to pack files into epub...')
 
-    print(">Download completed, now packaging epub...")
+    print(">now packaging epub...")
     epub = html2epub.Epub(title, language="zh-cn",
                           creator="cool18", publisher="cool18")
     for file in os.listdir("."):
