This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit ca67424

Working mangafox and mangahere extractors
1 parent 3e7ee39 commit ca67424

File tree

4 files changed: +135, -26 lines

comic_scraper/base_comic.py

4 additions, 5 deletions
@@ -49,7 +49,7 @@ def set_download_chapters(self, potential_keys=None):
         print(sorted(keys))
 
     def download_comic(self):
-        """Begin download the chapters in the comic."""
+        """Begin download of chapters in the comic."""
         with concurrent.futures.ThreadPoolExecutor(
                 max_workers=self.chapter_threads) as executor:
             future_to_chapter = {
@@ -74,14 +74,13 @@ def extract_chapters(self):
 class BaseChapter:
     """Base Chapter class. Contains pages."""
 
-    def __init__(self, comic, chapter_num, volume_num, chapter_url):
+    def __init__(self, comic, chapter_num, chapter_url):
         """Initialize constants required for download."""
         # Extract necessary information from the comic object
         self.comic_name = comic.name
         self.comic_download_location = comic.download_location
         # Create chapter specific variables
         self.chapter_num = chapter_num
-        self.volume_num = volume_num
         self.chapter_url = chapter_url
         # Threads and retry time
         self.page_threads = comic.page_threads
@@ -113,8 +112,8 @@ def download_chapter(self):
 
         # Convert the folder to a comic book zip filename
         chapter_name = os.path.join(
-            self.comic_download_location, '%s-%g (v%d)'
-            % (self.comic_name, self.chapter_num, self.volume_num))
+            self.comic_download_location, '%s-%g'
+            % (self.comic_name, self.chapter_num))
 
         if self.comic_file_format == 'pdf':
             pdfdir(self.chapter_location, chapter_name + ".pdf")

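Note: with volume_num gone from BaseChapter, chapter archives are named from the comic name and chapter number alone. A minimal standalone sketch of the new naming expression (the comic name and download location below are made-up placeholders):

import os

# Hypothetical stand-ins for comic.name and comic.download_location
comic_name = 'example-manga'
download_location = '/tmp/comics'
chapter_num = 42.5  # '%g' prints 42.5 here, and plain 42 for a whole number

# Mirrors the updated chapter_name expression in download_chapter()
chapter_name = os.path.join(download_location,
                            '%s-%g' % (comic_name, chapter_num))
print(chapter_name + '.pdf')  # /tmp/comics/example-manga-42.5.pdf
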
comic_scraper/current_comic.py

3 additions, 0 deletions
@@ -1,9 +1,12 @@
 """Define current comic class based on url."""
 
 from extractors.mangafox import MangaFoxComic
+from extractors.mangahere import MangaHereComic
 
 
 def comic(comic_url, args, verify_https):
     """Send the appropriate class."""
     if 'mangafox' in comic_url:
         return MangaFoxComic(comic_url, args, verify_https)
+    elif 'mangahere' in comic_url:
+        return MangaHereComic(comic_url, args, verify_https)

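Note: dispatch is a plain substring test on the URL. A quick illustration of the selection logic in isolation (the URLs are made-up; args and verify_https are omitted since only the branching matters here):

def pick_extractor(comic_url):
    """Return the extractor class name a URL would dispatch to, or None."""
    if 'mangafox' in comic_url:
        return 'MangaFoxComic'
    elif 'mangahere' in comic_url:
        return 'MangaHereComic'
    return None  # unmatched URLs fall through, as in comic() above

print(pick_extractor('http://mangafox.me/manga/example/'))       # MangaFoxComic
print(pick_extractor('http://www.mangahere.co/manga/example/'))  # MangaHereComic
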
comic_scraper/extractors/mangafox.py

5 additions, 21 deletions
@@ -34,33 +34,17 @@ def extract_chapters(self):
                  ('manga' in link.get('href'))]
 
         for link in links:
-            chapter_link = urljoin(urlscheme.scheme + "://" + urlscheme.netloc,
+            chapter_link = urljoin(urlscheme.scheme
+                                   + "://" + urlscheme.netloc,
                                    '/'.join(link.split('/')[:-1]))
-            matched_groups = re.search('v(\d*)/c([\d \.]*)', chapter_link)
+            matched_groups = re.search('c([\d \.]+)', chapter_link)
             if matched_groups:
-                volume_num = int(matched_groups.group(1))
-                chapter_num = float(matched_groups.group(2))
+                chapter_num = float(matched_groups.group(1))
                 if chapter_num in chapters:
                     continue
                 else:
                     chapters[chapter_num] = MangaFoxChapter(
-                        self, chapter_num, volume_num, chapter_link)
-
-        if (not chapters) and links:
-            # Maybe the manga has no volume (try this out)
-            for link in links:
-                chapter_link = urljoin(urlscheme.scheme
-                                       + "://" + urlscheme.netloc,
-                                       '/'.join(link.split('/')[:-1]))
-                matched_groups = re.search('c([\d \.]+)', chapter_link)
-                if matched_groups:
-                    volume_num = 1
-                    chapter_num = float(matched_groups.group(1))
-                    if chapter_num in chapters:
-                        continue
-                    else:
-                        chapters[chapter_num] = MangaFoxChapter(
-                            self, chapter_num, chapter_link)
+                        self, chapter_num, chapter_link)
 
         return chapters

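Note: the single pattern 'c([\d \.]+)' now handles both URL layouts that the deleted fallback loop special-cased. A hedged check against two made-up chapter links:

import re

# Hypothetical chapter links, with and without a volume component
links = ['http://mangafox.me/manga/example/v05/c042',
         'http://mangafox.me/manga/example/c042.5']

for link in links:
    m = re.search(r'c([\d \.]+)', link)
    if m:
        print(float(m.group(1)))  # prints 42.0, then 42.5
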
comic_scraper/extractors/mangahere.py

123 additions, 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
+"""Extractor for mangahere.co."""
+
+from base_comic import BaseComic, BaseChapter
+from urllib.parse import urlparse, urljoin
+import requests
+import bs4 as bsoup
+from collections import defaultdict
+import re
+import os
+import shutil
+from random import shuffle, uniform
+from copy import deepcopy
+from time import sleep
+
+
+class MangaHereComic(BaseComic):
+    """Base comic class."""
+
+    def extract_chapters(self):
+        """Extract chapters function (backbone)."""
+        comic_name = self.name
+        url = self.url
+        urlscheme = urlparse(url)
+
+        # Get chapters
+        r = requests.get(url, verify=self.verify_https)
+        soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+
+        chapters = defaultdict(MangaHereChapter)
+        links = [link.get('href')
+                 for link in soup.find_all('a')
+                 if link.get('href') and
+                 (comic_name in link.get('href')) and
+                 ('manga' in link.get('href'))]
+
+        for link in links:
+            chapter_link = urljoin(urlscheme.scheme
+                                   + "://" + urlscheme.netloc,
+                                   '/'.join(link.split('/')[:-1]))
+            matched_groups = re.search('c([\d \.]+)', chapter_link)
+            if matched_groups:
+                chapter_num = float(matched_groups.group(1))
+                if chapter_num in chapters:
+                    continue
+                else:
+                    chapters[chapter_num] = MangaHereChapter(
+                        self, chapter_num, chapter_link)
+
+        return chapters
+
+
+class MangaHereChapter(BaseChapter):
+    """Base chapter class."""
+
+    def get_pages(self):
+        """Obtain list of pages in a manga chapter."""
+        # Get base url
+        base_url = self.chapter_url
+        max_retries = deepcopy(self.max_retries)
+        wait_retry_time = deepcopy(self.wait_time)
+
+        while True:
+            # Get javascript blocks
+            r = requests.get(base_url, verify=self.verify_https)
+            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+            scripts = [script for script in soup.find_all(
+                'script', attrs={'type': 'text/javascript'})]
+
+            if scripts:
+                # Get total pages
+                for script in scripts:
+                    if script.contents:
+                        matched_groups = re.search(
+                            'var total_pages\s?=\s?(\d*)\s?;',
+                            script.contents[0])
+                        if matched_groups:
+                            total_pages = int(matched_groups.group(1))
+                            break
+                # Get page urls
+                page_urls = ["%s/%d.html" % (self.chapter_url, i + 1)
+                             for i in range(total_pages)]
+                page_num = [i + 1 for i in range(total_pages)]
+                pages = list(zip(page_urls, page_num))
+                shuffle(pages)
+
+                return True, pages
+
+            elif (max_retries > 0):
+                # Idea from manga_downloader (which in turn was from wget)
+                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
+                max_retries -= 1
+            else:
+                return False, None
+
+    def download_page(self, page):
+        """Download individual pages in a manga."""
+        page_url, page_num = page
+        filename = os.path.join(self.chapter_location,
+                                '%0.3d.jpg' % (page_num))
+
+        max_retries = deepcopy(self.max_retries)
+        wait_retry_time = deepcopy(self.wait_time)
+
+        while True:
+            r = requests.get(page_url, verify=self.verify_https)
+            soup = bsoup.BeautifulSoup(r.text, 'html.parser')
+            img = soup.find_all('img', attrs={'id': 'image'})
+            if img:
+                image = img[0].get('src')
+                self.download_image(image, filename)
+                return True
+            elif (max_retries > 0):
+                # Idea from manga_downloader (which in turn was from wget)
+                sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
+                max_retries -= 1
+            else:
+                print("Failed download: Chapter-%g, page-%d"
+                      % (self.chapter_num, page_num))
+                shutil.copyfile(
+                    os.path.join(os.path.dirname(
+                        os.path.realpath(__file__)), 'no_image_available.png'),
+                    filename)
+                return False

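Note: get_pages() and download_page() share the same retry pattern, credited to manga_downloader/wget in the comments. A minimal sketch of just that backoff loop, with a hypothetical fetch callable and made-up default limits:

from random import uniform
from time import sleep

def fetch_with_retries(fetch, max_retries=3, wait_time=5):
    """Call fetch() until it returns a value or retries run out."""
    while True:
        result = fetch()
        if result is not None:
            return True, result
        elif max_retries > 0:
            # Wait a randomized interval around wait_time before retrying
            sleep(uniform(0.5 * wait_time, 1.5 * wait_time))
            max_retries -= 1
        else:
            return False, None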