This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit ca436d1

Author: Kevin Meredith (committed)
Added mangareader extractor
1 parent 064ef7e commit ca436d1

2 files changed: 139 additions & 0 deletions


comic_scraper/current_comic.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@
 from extractors.mangafox import MangaFoxComic
 from extractors.mangahere import MangaHereComic
 from extractors.mangastream import MangaStreamComic
+from extractors.mangareader import MangaReaderComic


 def comic(comic_url, args, verify_https):
@@ -13,3 +14,5 @@ def comic(comic_url, args, verify_https):
         return MangaHereComic(comic_url, args, verify_https)
     elif ('mangstream' in comic_url) or ('readms' in comic_url):
         return MangaStreamComic(comic_url, args, verify_https)
+    elif ('mangareader' in comic_url):
+        return MangaReaderComic(comic_url, args, verify_https)
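For orientation, comic() picks an extractor by plain substring checks on the URL. The sketch below is standalone and purely illustrative (the helper name and the URLs are placeholders, not code from this repository); it mirrors the branch this commit adds:

# Illustrative only: mirrors the new elif branch in comic(); hypothetical URLs.
def handled_by_mangareader(comic_url):
    return 'mangareader' in comic_url

print(handled_by_mangareader('http://www.mangareader.net/naruto'))  # True  -> MangaReaderComic
print(handled_by_mangareader('http://readms.net/r/naruto/700'))     # False -> MangaStreamComic branch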
comic_scraper/extractors/mangareader.py

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
"""Extractor for mangareader."""

from base_comic import BaseComic, BaseChapter
from urllib.parse import urlparse, urljoin
import requests
import bs4 as bsoup
from collections import defaultdict
import re
import os
import shutil
from random import shuffle, uniform
from copy import deepcopy
from time import sleep

class MangaReaderComic(BaseComic):
    """Comic class for mangareader."""

    def extract_chapters(self):
        """Extract chapters function (backbone)."""
        comic_name = self.name
        url = self.url
        urlscheme = urlparse(url)

        # Get chapters
        r = requests.get(url, verify=self.verify_https)
        soup = bsoup.BeautifulSoup(r.text, 'html.parser')

        chapters = defaultdict(MangaReaderChapter)
        # Keep only links whose href mentions the comic name.
        links = [link.get('href')
                 for link in soup.find_all('a')
                 if link.get('href') and
                 (comic_name in link.get('href'))]

        for link in links:
            chapter_link = urljoin(urlscheme.scheme
                                   + "://" + urlscheme.netloc,
                                   link)
            # The chapter number is the numeric path component of the link.
            matched_groups = re.search(r'/([\d \.]+)', chapter_link)
            if matched_groups:
                chapter_num = float(matched_groups.group(1))
                if chapter_num in chapters:
                    continue
                else:
                    chapters[chapter_num] = MangaReaderChapter(
                        self, chapter_num, chapter_link)

        return chapters

def page_filter(tag):
    """Select the <option> tags of the reader's pageMenu drop-down."""
    test = (tag.name == 'option')
    test = (test and tag.parent.name == 'select')
    test = (test and 'pageMenu' in tag.parent.get('name', ''))

    return test

class MangaReaderChapter(BaseChapter):
    """Chapter class for mangareader."""

    def get_pages(self):
        """Obtain list of pages in a manga chapter."""
        # Get base url
        base_url = self.chapter_url
        max_retries = deepcopy(self.max_retries)
        wait_retry_time = deepcopy(self.wait_time)
        # Obtain match url
        urlscheme = urlparse(base_url)

        while True:
            # Fetch the reader page and parse it.
            r = requests.get(base_url, verify=self.verify_https)
            soup = bsoup.BeautifulSoup(r.text, 'html.parser')

            page_list = soup.find_all(page_filter)

            pages = []
            for page in page_list:
                curr_url = page.get('value')
                try:
                    page_num = float(curr_url.split('/')[-1])
                except (ValueError, AttributeError):
                    # Fall back to page 1 when no numeric page component is present.
                    page_num = 1
                page_url = urljoin(urlscheme.scheme
                                   + "://" + urlscheme.netloc, curr_url)
                pages.append((page_url, page_num))

            if pages:
                # Randomise download order.
                shuffle(pages)
                return True, pages

            elif (max_retries > 0):
                # Idea from manga_downloader (which in turn was from wget)
                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
                max_retries -= 1
            else:
                return False, None

    def download_page(self, page):
        """Download individual pages in a manga."""
        page_url, page_num = page
        urlscheme = urlparse(page_url)
        filename = os.path.join(self.chapter_location,
                                '%0.3d.jpg' % (page_num))

        max_retries = deepcopy(self.max_retries)
        wait_retry_time = deepcopy(self.wait_time)

        while True:
            r = requests.get(page_url, verify=self.verify_https)
            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
            # Locate the div holding the page image.
            img = None  # ensure defined even if no matching div is found
            for div in soup.find_all('div'):
                if div.get('class'):
                    if div.get('class')[0] == 'page':
                        img = div.find_all('img')
                        break

            if img:
                image = urljoin(urlscheme.scheme
                                + "://" + urlscheme.netloc,
                                img[0].get('src'))
                self.download_image(image, filename)
                return True
            elif (max_retries > 0):
                # Idea from manga_downloader (which in turn was from wget)
                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
                max_retries -= 1
            else:
                print("Failed download: Chapter-%g, page-%d"
                      % (self.chapter_num, page_num))
                shutil.copyfile(
                    os.path.join(os.path.dirname(
                        os.path.realpath(__file__)), 'no_image_available.png'),
                    filename)
                return False
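To make the page discovery in get_pages concrete: page_filter keeps only the <option> entries of the <select name="pageMenu"> drop-down that the reader pages use for per-page navigation. The sketch below is self-contained; the markup is hypothetical, modelled on that structure rather than copied from the site:

# Self-contained sketch of what page_filter selects; the HTML below is hypothetical.
import bs4 as bsoup

def page_filter(tag):
    test = (tag.name == 'option')
    test = (test and tag.parent.name == 'select')
    test = (test and 'pageMenu' in tag.parent.get('name', ''))
    return test

html = """
<select id="pageMenu" name="pageMenu">
  <option value="/naruto/1">1</option>
  <option value="/naruto/1/2">2</option>
</select>
<select name="chapterMenu">
  <option value="/naruto/2">Chapter 2</option>
</select>
"""

soup = bsoup.BeautifulSoup(html, 'html.parser')
print([opt.get('value') for opt in soup.find_all(page_filter)])
# -> ['/naruto/1', '/naruto/1/2']  (chapterMenu options are rejected by the name check)

Each selected value is then resolved against the page's scheme and netloc with urljoin, which is how get_pages builds the (page_url, page_num) tuples it hands to download_page.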
