#!/usr/bin/python3
import configparser
import os
import queue
import re
import shutil
import sys
import threading
import time
import urllib.parse

import bs4
import chardet
import html2epub
import requests

# requires: requests bs4 lxml pysocks html2epub
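# Usage sketch (argument handling lives in the __main__ block below):
#   python3 <this script> <cool18 article URL containing a tid= parameter>
# With no argument, the script prompts for the URL interactively.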

# Default settings; loadConfig() overrides them from config.ini when present.
config = {
    "enableProxy": "no",
    "proxy": "socks5://127.0.0.1:1081",
    "minContent": 1000,
    "waitPackage": "no",
    "autoDelete": "yes",
    "verifyCert": "yes",
    "threads": 3
}


def find_all(a_str, sub):
    """Yield every index at which sub occurs in a_str."""
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)


def to_str(bytes_or_str):
    """Decode raw bytes using the encoding detected by chardet."""
    codec = chardet.detect(bytes_or_str)
    return bytes_or_str.decode(encoding=codec['encoding'])


def fetch(url):
    """Download url and return the decoded page source ('' on failure)."""
    proxies = None
    if config['enableProxy'] == 'yes':
        proxy = config['proxy']
        proxies = dict(http=proxy, https=proxy)
    try:
        resp = requests.get(url, proxies=proxies,
                            verify=(config['verifyCert'] == 'yes'))
        return to_str(resp.content)
    except Exception:
        # Network or decoding failure: treat the page as empty.
        return ""


# Markers that delimit the post body and the follow-up link list in cool18 pages.
P_START = "<!--bodybegin-->"
P_END = "<!--bodyend-->"
L_START = '''<a name="followups" style=''>'''
L_END = '''<a name="postfp">'''


def clean_title(_title):
    """Strip the site suffix and replace characters that are illegal in file names."""
    title = _title.replace(" - cool18.com", "")
    for ch in '/\\*?<>|:':
        title = title.replace(ch, "-")
    return title.strip()


def extract_title(content, full=False):
    """Return the page title; unless full=True, prefer the name inside 《》 or 【】."""
    title_left = content.find('<title>') + len('<title>')
    title_right = content.find('</title>')
    title = content[title_left:title_right]

    if full:
        title = clean_title(title)
    else:
        title_search = re.search('[《](.*?)[》]', title, re.IGNORECASE)
        if title_search:
            title = title_search.group(1)
        else:
            title_search = re.search('[【](.*?)[】]', title, re.IGNORECASE)
            if title_search:
                title = title_search.group(1)
            else:
                title = clean_title(title)
    return title


def should_ignore_this_link(_title):
    # Skip reply links that are forum noise (coin rewards, "no content" replies,
    # board-fund and lucky-red-packet posts, "... 给 ..." transfer notices).
    iwords = ["银元奖励", "无内容", "版块基金", " 给 ", "幸运红包"]
    for i in iwords:
        if i in _title:
            return True
    return False


def loadConfig():
    cf = configparser.ConfigParser()
    try:
        cf.read('config.ini')
        config['enableProxy'] = cf.get('network', 'enableProxy')
        config['proxy'] = cf.get('network', 'proxy')
        config['verifyCert'] = cf.get('network', 'verifyCert')
        config['minContent'] = cf.get('config', 'minContent')
        config['waitPackage'] = cf.get('config', 'waitPackage')
        config['autoDelete'] = cf.get('config', 'autoDelete')
        config['threads'] = cf.get('config', 'threads')
        requests.packages.urllib3.disable_warnings()
    except Exception:
        # Missing config.ini or missing keys: keep the built-in defaults.
        pass
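
# An example config.ini read by loadConfig() (illustrative values; the file is
# optional and the built-in defaults above are used when it is absent):
#
#   [network]
#   enableProxy = yes
#   proxy = socks5://127.0.0.1:1081
#   verifyCert = no
#
#   [config]
#   minContent = 1000
#   waitPackage = no
#   autoDelete = yes
#   threads = 3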


def download(url, threadname):
    # The tid query parameter identifies the post and is used in the output file name.
    uri = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(uri.query)

    tid = params.get('tid', [None])[0]
    if not tid:
        return

    src = fetch(url)
    title = extract_title(src, full=True)
    print(f'{threadname}:GOT {title}')
    raw = str(src)

    try:
        # Cut the post body out from between the bodybegin/bodyend markers.
        pos_start = raw.index(P_START) + len(P_START)
        pos_end = raw.index(P_END)
        page_content = raw[pos_start:pos_end]
        content_soup = bs4.BeautifulSoup(page_content, "lxml")
        # Queue in-page chapter links and drop them from the saved body.
        links = content_soup.find_all('a')
        for a in links:
            _url = a.get('href')
            if _url and len(_url.strip()) > 8:
                if config['host'] in _url and _url not in downloaded:
                    workqueue.put(_url)
                    a.extract()
    except ValueError:
        # Body markers not found: this is not a regular post page.
        return

    try:
        # Queue follow-up links from the reply list below the post.
        lpos_start = raw.index(L_START) + len(L_START)
        lpos_end = raw.index(L_END)
        comments = raw[lpos_start:lpos_end]
        comm_soup = bs4.BeautifulSoup(comments, "lxml")
        for a in comm_soup.find_all('a'):
            _title = a.getText()
            if should_ignore_this_link(_title):
                continue
            _u = a.get('href')
            if not _u:
                continue
            if not _u.startswith("http"):
                # Relative reply links are resolved against the forum host.
                _u = config['host'] + _u
            if config['host'] in _u and _u not in downloaded:
                workqueue.put(_u)
    except ValueError:
        pass

    # Skip files that were already written by a previous run.
    if os.path.exists("%s-%s.html" % (tid, title)):
        print(f"{threadname}:SKP {tid}-{title}.html", file=sys.stderr)
        return

    [s.extract() for s in content_soup('script')]
    # Wash the text: collapse whitespace runs and site watermarks into
    # @@@@@@@@ markers, then turn marker runs into paragraph breaks.
    page_content = str(content_soup.find('body').getText())
    page_content = page_content.replace(" ", "")
    page_content = page_content.replace("\xa0", "@@@@@@@@")  # non-breaking space (assumed)
    page_content = page_content.replace(os.linesep, "@@@@@@@@")
    page_content = page_content.replace("\n", "@@@@@@@@")
    page_content = page_content.replace('cool18.com', '@@@@@@@@')
    page_content = page_content.replace('www.6park.com', '')
    page_content = page_content.replace('6park.com', '')
    page_content = page_content.replace("@@@@@@@@@@@@@@@@", "</p><p>")
    page_content = page_content.replace("@@@@@@@@", "</p><p>")
    page_content = page_content.replace("</p><p></p><p>", "</p><p>")

    try:
        # Cut off everything after the trailing "评分完成" (rating completed) marker.
        last_pos = page_content.rindex('评分完成')
        page_content = page_content[:last_pos]
    except ValueError:
        pass

    if len(page_content.strip()) > int(config['minContent']):
        try:
            with open("%s-%s.html" % (tid, title), 'w+', encoding='utf-8', errors='ignore') as file:
                file.write("""<?xml version="1.0" encoding="utf-8" standalone="no"?>
            <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
            <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">
            <head>
            <title>""")
                file.write(title)
                file.write(r"</title></head><body><p>")
                file.write(page_content)
                file.write(r"</p></body></html>")
        except OSError:
            print(f"{threadname}:Error writing {title}", file=sys.stderr)
    else:
        # Too short to be a real chapter (e.g. a placeholder reply); ignore it.
        print(f'{threadname}:IGN {title}')
    # Remember the URL so it is not queued again.
    downloaded.add(url)


class fetcher(threading.Thread):
    # Worker thread: pulls URLs off the shared queue and downloads them
    # until the main thread sets exitflag.
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q
        self.daemon = True

    def run(self):
        while not exitflag:
            url = None
            with threadlock:
                if not workqueue.empty():
                    url = workqueue.get()
            if url:
                downloading.append(url)
                try:
                    download(url, self.name)
                finally:
                    # Clear the in-flight marker even if download() returns early
                    # or raises, so the main loop can finish.
                    downloading.remove(url)


workqueue = queue.Queue()      # URLs waiting to be downloaded
threads = []                   # worker threads
downloaded = set()             # URLs already processed
threadlock = threading.Lock()  # guards the queue check-then-get in run()
exitflag = 0                   # set to 1 to stop the workers
downloading = []               # URLs currently being processed

# Main Logic
if __name__ == '__main__':
    args_length = len(sys.argv)
    url = None
    if args_length > 1:
        url = sys.argv[1]
    if not url:
        url = str(input("Please paste the cool18 article URL: "))
    loadConfig()
    pypath = sys.argv[0]
    pydir = os.getcwd()

    # Everything under the same directory as the start URL is treated as in-site.
    config['host'] = url[:url.rindex('/')+1]

    src = fetch(url)
    title = extract_title(src)

    # Download into a working directory named after the title.
    if not os.path.exists(title):
        os.mkdir(title)
    os.chdir(title)
    exitflag = 0

    tid = 0
    # Seed the queue with the start URL and launch the first worker.
    workqueue.put(url)

    t = fetcher(f"T{tid}", workqueue)
    tid += 1
    threads.append(t)
    t.start()

    # While work remains, keep adding workers up to the configured thread count.
    while downloading or not workqueue.empty():
        time.sleep(0.1)
        if len(threads) < int(config['threads']):
            t = fetcher(f"T{tid}", workqueue)
            tid += 1
            threads.append(t)
            t.start()

    # Queue is empty and nothing is in flight: tell the workers to exit.
    exitflag = 1

    print(">Download completed.")
    if config['waitPackage'] == 'yes':
        input('>Press Enter to pack files into epub...')

    print(">now packaging epub...")
    epub = html2epub.Epub(title, language="zh-cn",
                          creator="cool18", publisher="cool18")
    for file in os.listdir("."):
        chap = html2epub.create_chapter_from_file(file)
        epub.add_chapter(chap)
    epubpath = epub.create_epub(pydir)
    print(">OK, epub generated at: %s" % epubpath)

    # Optionally remove the working directory once the epub has been written.
    if config['autoDelete'] == 'yes':
        os.chdir("..")
        print(">Deleting Directory: %s" % title)
        shutil.rmtree(title)