1- #!/usr/bin/env python
1+ #!/usr/bin/env python3
22import argparse
33import bs4 as bsoup
44import requests
55from collections import defaultdict
66import shutil
77import os
8+ import re
89import concurrent .futures
910from zipfile import ZipFile , ZIP_DEFLATED
11+ from random import shuffle , uniform
12+ from numpy import arange
13+ from time import sleep
14+
15+
class Comic:
    """A comic/manga series hosted on a supported site.

    Supported hosts: mangafox / mangahere ('manga' mode) and
    readcomics.tv ('comic' mode).  Builds an index of all chapters on
    construction; chapters are downloaded via `download_comic`.
    """

    def __init__(self, comic_url, root_dir):
        """Create the per-comic download directory and index every chapter.

        comic_url -- series page URL, e.g. http://mangafox.me/manga/kingdom
        root_dir  -- directory under which a folder named after the comic
                     is created
        """
        self.url = comic_url
        # The URL may or may not end with '/': take the last non-empty part.
        parts = comic_url.split('/')
        self.name = parts[-1] if parts[-1] else parts[-2]
        # Set download location
        self.download_location = os.path.abspath(
            os.path.join(root_dir, self.name))
        if not os.path.exists(self.download_location):
            os.makedirs(self.download_location)
        # Get all chapters and mode of download
        self.all_chapters = self.get_chapters()

    def get_chapters(self):
        """Return {chapter_number: Chapter} for this comic, setting self.mode.

        Raises ValueError when the URL belongs to an unsupported host.
        """
        if 'mangafox' in self.url:
            self.mode = ['manga', 'mangafox']
            chapters = self.manga_extract_chapters(self.url)
        elif 'mangahere' in self.url:
            self.mode = ['manga', 'mangahere']
            chapters = self.manga_extract_chapters(self.url)
        elif 'readcomics' in self.url:
            self.mode = ['comic']
            chapters = self.comic_extract_chapters(self.url)
        else:
            # BUG FIX: the message parts were separated by commas, so the
            # ValueError carried a tuple instead of one readable string.
            raise ValueError(
                'The scraper currently only supports mangafox, '
                'mangahere and readcomics.tv '
                '%s not supported' % (self.url))
        return chapters

    def set_download_chapters(self, potential_keys=None):
        """Select which chapters to download.

        potential_keys -- chapter numbers to keep; None selects every
                          chapter found on the series page.
        """
        if potential_keys:
            # Intersect with what actually exists on the site.
            keys = list(set(potential_keys) & set(self.all_chapters.keys()))
        else:
            keys = list(self.all_chapters.keys())

        # Sort keys to make it ascending order and make it a new dict
        keys.sort()
        self.chapters_to_download = {key: self.all_chapters[key]
                                     for key in keys}
        # Print downloading chapters
        print("Downloading the below chapters:")
        print(keys)

    def download_comic(self):
        """Download the selected chapters, five chapters in parallel."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_chapter = {
                executor.submit(chapter.download_chapter): chapter_num
                for chapter_num, chapter in self.chapters_to_download.items()}

            for future in concurrent.futures.as_completed(future_to_chapter):
                chapter_num = future_to_chapter[future]
                try:
                    future.result()
                except Exception as exc:
                    print('Chapter-%g generated an exception: %s'
                          % (chapter_num, exc))
                else:
                    print('Downloaded: Chapter-%g' % (chapter_num))

    def manga_extract_chapters(self, url):
        """Scrape .../vNN/cNNN chapter links from a mangafox/mangahere
        series page and return {chapter_number: Chapter}."""
        comic_name = self.name
        r = requests.get(url)
        soup = bsoup.BeautifulSoup(r.text, 'html.parser')

        chapters = {}
        links = [link.get('href')
                 for link in soup.find_all('a')
                 if link.get('href') and
                 (comic_name in link.get('href')) and
                 ('manga' in link.get('href'))]

        for link in links:
            # Drop the trailing page component to get the chapter URL.
            chapter_link = '/'.join(link.split('/')[:-1])
            # Raw string: avoids the invalid-escape DeprecationWarning.
            matched_groups = re.search(r'v(\d*)/c([\d \.]*)', chapter_link)
            if matched_groups:
                volume_num = int(matched_groups.group(1))
                chapter_num = float(matched_groups.group(2))
                # Keep the first link seen for each chapter number.
                if chapter_num not in chapters:
                    chapters[chapter_num] = Chapter(
                        self, chapter_num, volume_num, chapter_link)
        return chapters

    def comic_extract_chapters(self, url):
        """Scrape chapter links from a readcomics.tv series page and
        return {chapter_number: Chapter}; the volume is always 1."""
        comic = url.split('/')[-1]
        r = requests.get(url)
        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
        volume_num = 1

        chapters = {}
        for link in soup.find_all('a'):
            href = link.get('href')
            # BUG FIX: anchors without an href yield None and made the
            # membership tests raise TypeError (the manga scraper already
            # guarded against this).
            if href and (comic in href) and ('chapter' in href):
                chapter_match = re.search(r'chapter-([\d -]*)', href)
                # BUG FIX: guard against a non-matching href instead of
                # calling .group() on None.
                if not chapter_match:
                    continue
                # e.g. 'chapter-12-5' -> 12.5
                chapter_num = float(
                    '.'.join(chapter_match.group(1).split('-')))
                if chapter_num not in chapters:
                    chapters[chapter_num] = Chapter(
                        self, chapter_num, volume_num, href + '/full')

        return chapters
120+
121+
class Chapter:
    """One chapter of a Comic: fetches its pages and packs them into a
    .cbz archive in the comic's download directory."""

    def __init__(self, comic, chapter_num, volume_num, chapter_url):
        # Extract necessary information from the comic object
        self.comic_name = comic.name
        self.comic_download_location = comic.download_location
        self.comic_mode = comic.mode
        # Create chapter specific variables
        self.chapter_num = chapter_num
        self.volume_num = volume_num
        self.chapter_url = chapter_url

    def download_chapter(self):
        '''Download all pages and convert the chapter into a cbz file.

        Raises RuntimeError when the list of pages cannot be obtained.
        '''
        init_status, pages, download_func = self.initialize_chapter_download()

        if not init_status:
            raise RuntimeError('Unable to obtain pages in the chapter')

        self.chapter_location = os.path.join(
            self.comic_download_location, 'chapter-' + str(self.chapter_num))
        if not os.path.exists(self.chapter_location):
            os.makedirs(self.chapter_location)

        # Download individual pages in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            # BUG FIX: consume the map iterator so worker exceptions are
            # re-raised here; previously they were silently discarded and
            # a broken chapter was still zipped up as if complete.
            list(executor.map(download_func, pages))

        # Convert the folder to a comic book zip filename
        if self.comic_mode[0] == 'manga':
            chapter_name = os.path.join(
                self.comic_download_location, '%s-%g (v%d).cbz'
                % (self.comic_name, self.chapter_num, self.volume_num))
        elif self.comic_mode[0] == 'comic':
            chapter_name = os.path.join(
                self.comic_download_location, '%s-%g.cbz'
                % (self.comic_name, self.chapter_num))

        zipdir(self.chapter_location, chapter_name)
        shutil.rmtree(self.chapter_location)

    def initialize_chapter_download(self):
        '''Return (status, pages, page-download function) based on the mode.'''
        if self.comic_mode[0] == 'manga':
            init_status, pages = self.manga_get_pages()
            func = self.manga_download_page
        elif self.comic_mode[0] == 'comic':
            init_status, pages = self.comic_get_pages()
            func = self.comic_download_page

        return init_status, pages, func

    def manga_get_pages(self):
        '''Return (True, [(page_url, page_num), ...]) or (False, None).

        Retries up to 5 times with randomized waits when the page count
        cannot be scraped from the chapter's first page.
        '''
        # Get base url
        if self.comic_mode[1] == 'mangafox':
            base_url = self.chapter_url + '/1.html'
        elif self.comic_mode[1] == 'mangahere':
            base_url = self.chapter_url

        max_retries = 5
        wait_retry_time = 5

        while True:
            # Get javascript blocks
            r = requests.get(base_url)
            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
            scripts = soup.find_all(
                'script', attrs={'type': 'text/javascript'})

            # Get total pages
            total_pages = None
            for script in scripts:
                if script.contents:
                    matched_groups = re.search(
                        r'var total_pages\s?=\s?(\d*)\s?;',
                        script.contents[0])
                    if matched_groups:
                        total_pages = int(matched_groups.group(1))
                        break

            if total_pages is not None:
                # BUG FIX: previously total_pages was referenced unbound
                # (NameError) when scripts existed but none matched the
                # regex; that case now falls through to the retry branch.
                page_urls = ["%s/%d.html" % (self.chapter_url, i + 1)
                             for i in range(total_pages)]
                page_num = [i + 1 for i in range(total_pages)]
                pages = list(zip(page_urls, page_num))
                # Randomize the download order.
                shuffle(pages)
                return True, pages
            elif max_retries > 0:
                # Idea from manga_downloader (which in turn was from wget)
                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
                max_retries -= 1
            else:
                return False, None

    def comic_get_pages(self):
        '''Return (True, [(image_url, page_num), ...]) scraped from a
        readcomics chapter's /full page.'''
        url = self.chapter_url
        r = requests.get(url)
        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
        images = [image.get('src') for image in soup.find_all(
            'img', attrs={'class': "chapter_img"})]
        page_num = [i + 1 for i in range(len(images))]
        pages = list(zip(images, page_num))
        shuffle(pages)

        return True, pages

    def manga_download_page(self, page):
        ''' Downloads individual pages in a manga '''
        page_url, page_num = page
        filename = os.path.join(self.chapter_location,
                                '%0.3d.jpg' % (page_num))

        max_retries = 5
        wait_retry_time = 5

        while True:
            r = requests.get(page_url)
            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
            img = soup.find_all('img', attrs={'id': 'image'})
            if img:
                image = img[0].get('src')
                download_image(image, filename)
                return True
            elif max_retries > 0:
                # Idea from manga_downloader (which in turn was from wget)
                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
                max_retries -= 1
            else:
                print("Failed download: Chapter-%g, page-%d"
                      % (self.chapter_num, page_num))
                # Keep page numbering intact by inserting a placeholder
                # image shipped next to this script.
                shutil.copyfile(
                    os.path.join(os.path.dirname(
                        os.path.realpath(__file__)), 'no_image_available.png'),
                    filename)
                return False

    def comic_download_page(self, page):
        ''' Downloads an individual page in a comic '''
        image, page_num = page
        filename = os.path.join(self.chapter_location,
                                '%0.3d.jpg' % (page_num))

        download_image(image, filename)
        return True
10266
11267
12268def download_image (url , filename ):
@@ -28,58 +284,16 @@ def zipdir(folder, filename):
28284 zipf .close ()
29285
30286
def readcomics_extract_chapters(url):
    """Scrape a readcomics.tv series page and return a mapping of
    {chapter_number: chapter_url + '/full'}."""
    comic = url.split('/')[-1]
    r = requests.get(url)
    soup = bsoup.BeautifulSoup(r.text, 'html.parser')

    chapters = defaultdict(str)
    for link in soup.find_all('a'):
        href = link.get('href')
        # BUG FIX: anchors without an href yield None, which made the
        # membership tests below raise TypeError.
        if href and (comic in href) and ('chapter' in href):
            # e.g. '.../chapter-12' -> 12
            chapter_num = int(href.split('-')[-1])
            if chapter_num in chapters:
                continue
            chapters[chapter_num] = href + '/full'

    return chapters
47-
48-
def readcomics_download_chapter(url, chapter_num, download_location):
    """Download every page image of one readcomics.tv chapter into a
    folder, zip that folder into a sibling .cbz, then remove the folder."""
    chapter_name = 'chapter-' + str(chapter_num)
    chapter_location = os.path.join(download_location, chapter_name)

    response = requests.get(url)
    page = bsoup.BeautifulSoup(response.text, 'html.parser')
    image_urls = [tag.get('src')
                  for tag in page.find_all('img',
                                           attrs={'class': "chapter_img"})]

    # Create chapter folder
    if not os.path.exists(chapter_location):
        os.makedirs(chapter_location)

    # Start downloading the pages, up to ten at a time
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for index, image_url in enumerate(image_urls):
            target = os.path.join(chapter_location, '%0.3d.jpg' % (index))
            executor.submit(download_image, image_url, target)

    # Convert the folder to a comic book zip filename
    zipdir(chapter_location, chapter_location + '.cbz')
    shutil.rmtree(chapter_location)
    print(chapter_name + ': Downloaded')
71-
72-
73287def main ():
74288 # parse input
75289 parser = argparse .ArgumentParser (
76290 description = (
77- 'Downloads all comics from'
78- 'the given url (currently works only with readcomics.tv). '
79- ' Example - A url input '
80- ' http://www.readcomics.tv/comic/spider-man-2016 looks '
81- 'for the spider-man-2016 comics in the url, downloads them all, '
82- 'and makes cbz files of all issues .' ))
291+ 'Downloads all manga chapters from'
292+ 'the given url (currently works with mangafox.me and mangahere.co '
293+ '). Example - A url input '
294+ ' http://mangafox.me/manga/kingdom looks '
295+ 'for the kingdom manga chapters in the url, downloads them all, '
296+ 'and makes cbz files of all chapters .' ))
83297
84298 parser .add_argument ('urls' , metavar = 'url' , nargs = '+' ,
85299 help = 'Comic urls to download' )
@@ -92,40 +306,30 @@ def main():
92306 args = parser .parse_args ()
93307
94308 for url in args .urls :
95- comic = url .split ('/' )[- 1 ]
96- print ('Downloading comic: ' + comic )
97-
98- # Extract chapters
99- if 'readcomics.tv' in url :
100- chapters = readcomics_extract_chapters (url )
309+ comic = Comic (url , args .location )
310+ print ('Downloading comic: ' + comic .name )
101311
102312 # Get chapters to download
103313 if args .chapters :
104314 try :
105315 start_stop = args .chapters .split (':' )
106316 if len (start_stop ) == 1 :
107- keys = [int (start_stop )]
317+ potential_keys = [float (start_stop [ 0 ] )]
108318 elif len (start_stop ) == 2 :
109- keys = list (range (
110- int (start_stop [0 ]), int (start_stop [1 ])+ 1 , 1 ))
319+ potential_keys = list (arange (
320+ float (start_stop [0 ]), float (start_stop [1 ])+ 0.5 , 0.5 ))
111321 else :
112322 raise SyntaxError (
113323 "Chapter inputs should be separated by ':'" )
114324 except TypeError :
115325 raise SyntaxError ("Chapter inputs should be separated by ':'" )
116326 exit ()
117- else :
118- keys = chapters .keys ()
119327
120- # Download chapters
121- if 'readcomics.tv' in url :
122- for k in keys :
123- download_location = os .path .abspath (
124- os .path .join (args .location , comic ))
125- if not os .path .exists (download_location ):
126- os .makedirs (download_location )
127- readcomics_download_chapter (chapters [k ], k , download_location )
328+ comic .set_download_chapters (potential_keys )
329+ else :
330+ comic .set_download_chapters ()
128331
332+ comic .download_comic ()
129333 print ('Downloaded comic:' + url .split ('/' )[- 1 ])
130334
131335
0 commit comments