This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit 3e7aa66 (1 parent: 67ac920)

Class architecture from manga-scraper. Working code

File tree: 1 file changed (+272 -68 lines)

comic_scraper/comic_scraper.py

Lines changed: 272 additions & 68 deletions
@@ -1,12 +1,268 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import argparse
 import bs4 as bsoup
 import requests
 from collections import defaultdict
 import shutil
 import os
+import re
 import concurrent.futures
 from zipfile import ZipFile, ZIP_DEFLATED
+from random import shuffle, uniform
+from numpy import arange
+from time import sleep
+
+
+class Comic:
+    def __init__(self, comic_url, root_dir):
+        self.url = comic_url
+        self.name = comic_url.split('/')[-1] \
+            if comic_url.split('/')[-1] else comic_url.split('/')[-2]
+        # Set download location
+        self.download_location = os.path.abspath(
+            os.path.join(root_dir, self.name))
+        if not os.path.exists(self.download_location):
+            os.makedirs(self.download_location)
+        # Get all chapters and mode of download
+        self.all_chapters = self.get_chapters()
+
+    def get_chapters(self):
+        if 'mangafox' in self.url:
+            self.mode = ['manga', 'mangafox']
+            chapters = self.manga_extract_chapters(self.url)
+        elif 'mangahere' in self.url:
+            self.mode = ['manga', 'mangahere']
+            chapters = self.manga_extract_chapters(self.url)
+        elif 'readcomics' in self.url:
+            self.mode = ['comic']
+            chapters = self.comic_extract_chapters(self.url)
+        else:
+            raise ValueError('The scraper currently only supports mangafox, '
+                             'mangahere and readcomics.tv; '
+                             '%s is not supported' % self.url)
+        return chapters
+
+    def set_download_chapters(self, potential_keys=None):
+        if potential_keys:
+            keys = list(set(potential_keys) & set(self.all_chapters.keys()))
+        else:
+            keys = list(self.all_chapters.keys())
+
+        # Sort keys into ascending order and build a new dict
+        keys.sort()
+        self.chapters_to_download = {key: self.all_chapters[key]
+                                     for key in keys}
+        # Print the chapters that will be downloaded
+        print("Downloading the following chapters:")
+        print(keys)
+
+    def download_comic(self):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            future_to_chapter = {
+                executor.submit(chapter.download_chapter): chapter_num
+                for chapter_num, chapter in self.chapters_to_download.items()}
+
+            for future in concurrent.futures.as_completed(future_to_chapter):
+                chapter_num = future_to_chapter[future]
+                try:
+                    future.result()
+                except Exception as exc:
+                    print('Chapter-%g generated an exception: %s'
+                          % (chapter_num, exc))
+                else:
+                    print('Downloaded: Chapter-%g' % (chapter_num))
+
+    def manga_extract_chapters(self, url):
+        comic_name = self.name
+        r = requests.get(url)
+        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+
+        chapters = defaultdict(defaultdict)
+        links = [link.get('href')
+                 for link in soup.find_all('a')
+                 if link.get('href') and
+                 (comic_name in link.get('href')) and
+                 ('manga' in link.get('href'))]
+
+        for link in links:
+            chapter_link = '/'.join(link.split('/')[:-1])
+            matched_groups = re.search(r'v(\d*)/c([\d \.]*)', chapter_link)
+            if matched_groups:
+                volume_num = int(matched_groups.group(1))
+                chapter_num = float(matched_groups.group(2))
+                if chapter_num in chapters:
+                    continue
+                else:
+                    chapters[chapter_num] = Chapter(
+                        self, chapter_num, volume_num, chapter_link)
+        return chapters
+
+    def comic_extract_chapters(self, url):
+        comic = url.split('/')[-1]
+        r = requests.get(url)
+        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+        volume_num = 1
+
+        chapters = defaultdict(defaultdict)
+        for link in soup.find_all('a'):
+            href = link.get('href')
+            # Guard against anchors without an href attribute
+            if href and (comic in href) and ('chapter' in href):
+                chapter_match = re.search(r'chapter-([\d -]*)', href)
+                chapter_string = chapter_match.group(1)
+                chapter_num = float('.'.join(chapter_string.split('-')))
+                if chapter_num in chapters:
+                    continue
+                else:
+                    chapters[chapter_num] = Chapter(
+                        self, chapter_num, volume_num, href + '/full')
+
+        return chapters
+
+
+class Chapter:
+    def __init__(self, comic, chapter_num, volume_num, chapter_url):
+        # Extract necessary information from the comic object
+        self.comic_name = comic.name
+        self.comic_download_location = comic.download_location
+        self.comic_mode = comic.mode
+        # Create chapter specific variables
+        self.chapter_num = chapter_num
+        self.volume_num = volume_num
+        self.chapter_url = chapter_url
+
+    def download_chapter(self):
+        ''' Download the chapter and convert it into a cbz file '''
+        init_status, pages, download_func = self.initialize_chapter_download()
+
+        if not init_status:
+            raise RuntimeError('Unable to obtain pages in the chapter')
+
+        self.chapter_location = os.path.join(
+            self.comic_download_location, 'chapter-' + str(self.chapter_num))
+        if not os.path.exists(self.chapter_location):
+            os.makedirs(self.chapter_location)
+
+        # Download individual pages in parallel
+        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+            executor.map(download_func, pages)
+
+        # Convert the folder to a comic book zip (cbz) file
+        if self.comic_mode[0] == 'manga':
+            chapter_name = os.path.join(
+                self.comic_download_location, '%s-%g (v%d).cbz'
+                % (self.comic_name, self.chapter_num, self.volume_num))
+        elif self.comic_mode[0] == 'comic':
+            chapter_name = os.path.join(
+                self.comic_download_location, '%s-%g.cbz'
+                % (self.comic_name, self.chapter_num))
+
+        zipdir(self.chapter_location, chapter_name)
+        shutil.rmtree(self.chapter_location)
+
+    def initialize_chapter_download(self):
+        ''' Obtain the pages and download function based on the mode '''
+        if self.comic_mode[0] == 'manga':
+            init_status, pages = self.manga_get_pages()
+            func = self.manga_download_page
+        elif self.comic_mode[0] == 'comic':
+            init_status, pages = self.comic_get_pages()
+            func = self.comic_download_page
+
+        return init_status, pages, func
+
+    def manga_get_pages(self):
+        # Get base url
+        if (self.comic_mode[1] == 'mangafox'):
+            base_url = self.chapter_url + '/1.html'
+        elif (self.comic_mode[1] == 'mangahere'):
+            base_url = self.chapter_url
+
+        max_retries = 5
+        wait_retry_time = 5
+
+        while True:
+            # Get javascript blocks
+            r = requests.get(base_url)
+            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+            scripts = [script for script in soup.find_all(
+                'script', attrs={'type': 'text/javascript'})]
+
+            if scripts:
+                # Get total pages
+                for script in scripts:
+                    if script.contents:
+                        matched_groups = re.search(
+                            r'var total_pages\s?=\s?(\d*)\s?;',
+                            script.contents[0])
+                        if matched_groups:
+                            total_pages = int(matched_groups.group(1))
+                            break
+                # Get page urls
+                page_urls = ["%s/%d.html" % (self.chapter_url, i+1)
+                             for i in range(total_pages)]
+                page_num = [i+1 for i in range(total_pages)]
+                pages = list(zip(page_urls, page_num))
+                shuffle(pages)
+
+                return True, pages
+
+            elif (max_retries > 0):
+                # Idea from manga_downloader (which in turn was from wget)
+                sleep(uniform(0.5*wait_retry_time, 1.5*wait_retry_time))
+                max_retries -= 1
+            else:
+                return False, None
+
+    def comic_get_pages(self):
+        url = self.chapter_url
+        r = requests.get(url)
+        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+        images = [image.get('src') for image in soup.find_all(
+            'img', attrs={'class': "chapter_img"})]
+        page_num = [i+1 for i in range(len(images))]
+        pages = list(zip(images, page_num))
+        shuffle(pages)
+
+        return True, pages
+
+    def manga_download_page(self, page):
+        ''' Downloads an individual page in a manga '''
+        page_url, page_num = page
+        filename = os.path.join(self.chapter_location,
+                                '%0.3d.jpg' % (page_num))
+
+        max_retries = 5
+        wait_retry_time = 5
+
+        while True:
+            r = requests.get(page_url)
+            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+            img = soup.find_all('img', attrs={'id': 'image'})
+            if img:
+                image = img[0].get('src')
+                download_image(image, filename)
+                return True
+            elif (max_retries > 0):
+                # Idea from manga_downloader (which in turn was from wget)
+                sleep(uniform(0.5*wait_retry_time, 1.5*wait_retry_time))
+                max_retries -= 1
+            else:
+                print("Failed download: Chapter-%g, page-%d"
+                      % (self.chapter_num, page_num))
+                shutil.copyfile(
+                    os.path.join(os.path.dirname(
+                        os.path.realpath(__file__)), 'no_image_available.png'),
+                    filename)
+                return False
+
+    def comic_download_page(self, page):
+        ''' Downloads an individual page in a comic '''
+        image, page_num = page
+        filename = os.path.join(self.chapter_location,
+                                '%0.3d.jpg' % (page_num))
+
+        download_image(image, filename)
+        return True
 
 
 def download_image(url, filename):
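The new Comic and Chapter classes replace the free-standing readcomics helpers removed below: Comic resolves a URL into numbered Chapter objects, and each Chapter fetches its own pages in parallel and packs them into a .cbz. A minimal usage sketch, assuming the module imports as comic_scraper; the URL, directory, and chapter keys are illustrative placeholders, not values from the commit:

from comic_scraper import Comic  # assumed import path

# 'mangafox' in the URL selects the ['manga', 'mangafox'] mode
# inside Comic.get_chapters().
comic = Comic('http://mangafox.me/manga/kingdom', '/tmp/comics')

# Keys are floats so half-chapters such as 10.5 survive the set
# intersection against all_chapters in set_download_chapters().
comic.set_download_chapters(potential_keys=[10.0, 10.5, 11.0])

# Chapters run on a 5-worker pool; each chapter fans out again over a
# 20-worker pool for its pages, then is zipped into a .cbz file.
comic.download_comic()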
@@ -28,58 +284,16 @@ def zipdir(folder, filename):
     zipf.close()
 
 
-def readcomics_extract_chapters(url):
-    comic = url.split('/')[-1]
-    r = requests.get(url)
-    soup = bsoup.BeautifulSoup(r.text, 'html.parser')
-
-    chapters = defaultdict(str)
-    for link in soup.find_all('a'):
-        if (comic in link.get('href')) and ('chapter' in link.get('href')):
-            chapter = link.get('href')
-            chapter_num = int(chapter.split('-')[-1])
-            if chapter_num in chapters:
-                continue
-            else:
-                chapters[chapter_num] = chapter + '/full'
-
-    return chapters
-
-
-def readcomics_download_chapter(url, chapter_num, download_location):
-    chapter_name = 'chapter-' + str(chapter_num)
-    chapter_location = os.path.join(download_location, chapter_name)
-    r = requests.get(url)
-    soup = bsoup.BeautifulSoup(r.text, 'html.parser')
-    images = [image.get('src') for image in soup.find_all(
-        'img', attrs={'class': "chapter_img"})]
-    filenames = [
-        os.path.join(chapter_location, '%0.3d.jpg' % (i))
-        for i in range(len(images))]
-    urls = zip(images, filenames)
-    # Create chapter folder
-    if not os.path.exists(chapter_location):
-        os.makedirs(chapter_location)
-    # Start downloading the urls
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        for image, filename in urls:
-            executor.submit(download_image, image, filename)
-    # Convert the folder to a comic book zip filename
-    zipdir(chapter_location, chapter_location + '.cbz')
-    shutil.rmtree(chapter_location)
-    print(chapter_name + ': Downloaded')
-
-
 def main():
     # parse input
     parser = argparse.ArgumentParser(
         description=(
-            'Downloads all comics from'
-            'the given url (currently works only with readcomics.tv). '
-            ' Example - A url input '
-            ' http://www.readcomics.tv/comic/spider-man-2016 looks '
-            'for the spider-man-2016 comics in the url, downloads them all, '
-            'and makes cbz files of all issues.'))
+            'Downloads all manga chapters from '
+            'the given url (currently works with mangafox.me and mangahere.co'
+            '). Example - A url input '
+            'http://mangafox.me/manga/kingdom looks '
+            'for the kingdom manga chapters in the url, downloads them all, '
+            'and makes cbz files of all chapters.'))
 
     parser.add_argument('urls', metavar='url', nargs='+',
                         help='Comic urls to download')
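Only the positional urls argument is visible in this hunk; the chapter and location options are defined outside it. A small sketch of just that visible part of the interface, with a placeholder URL:

import argparse

# Reduced parser covering only the argument shown above.
parser = argparse.ArgumentParser(
    description='Downloads all manga chapters from the given url.')
parser.add_argument('urls', metavar='url', nargs='+',
                    help='Comic urls to download')

args = parser.parse_args(['http://mangafox.me/manga/kingdom'])
print(args.urls)  # ['http://mangafox.me/manga/kingdom']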
@@ -92,40 +306,30 @@ def main():
     args = parser.parse_args()
 
     for url in args.urls:
-        comic = url.split('/')[-1]
-        print('Downloading comic: ' + comic)
-
-        # Extract chapters
-        if 'readcomics.tv' in url:
-            chapters = readcomics_extract_chapters(url)
+        comic = Comic(url, args.location)
+        print('Downloading comic: ' + comic.name)
 
         # Get chapters to download
         if args.chapters:
             try:
                 start_stop = args.chapters.split(':')
                 if len(start_stop) == 1:
-                    keys = [int(start_stop)]
+                    potential_keys = [float(start_stop[0])]
                 elif len(start_stop) == 2:
-                    keys = list(range(
-                        int(start_stop[0]), int(start_stop[1])+1, 1))
+                    potential_keys = list(arange(
+                        float(start_stop[0]), float(start_stop[1])+0.5, 0.5))
                 else:
                     raise SyntaxError(
                         "Chapter inputs should be separated by ':'")
             except TypeError:
                 raise SyntaxError("Chapter inputs should be separated by ':'")
                 exit()
-        else:
-            keys = chapters.keys()
 
-        # Download chapters
-        if 'readcomics.tv' in url:
-            for k in keys:
-                download_location = os.path.abspath(
-                    os.path.join(args.location, comic))
-                if not os.path.exists(download_location):
-                    os.makedirs(download_location)
-                readcomics_download_chapter(chapters[k], k, download_location)
+            comic.set_download_chapters(potential_keys)
+        else:
+            comic.set_download_chapters()
 
+        comic.download_comic()
         print('Downloaded comic:' + url.split('/')[-1])
 
 
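The move from range to numpy's arange with a 0.5 step is what lets a start:stop chapter argument cover half-chapters; a quick sketch of the expansion, with illustrative bounds:

from numpy import arange

# '10:12' expands to every half-chapter key in the range; the +0.5 on
# the stop value makes chapter 12 itself inclusive.
start_stop = '10:12'.split(':')
potential_keys = list(arange(float(start_stop[0]),
                             float(start_stop[1]) + 0.5, 0.5))
print([float(k) for k in potential_keys])
# -> [10.0, 10.5, 11.0, 11.5, 12.0]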