Skip to content
This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit 181f242

Browse files
committed
Fixed mangafox download. Automatically completes url with http(s).
1 parent 7450630 commit 181f242

File tree

2 files changed

+61
-35
lines changed

2 files changed

+61
-35
lines changed

comic_scraper/comic_scraper.py

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,21 @@
77
import os
88
import re
99
import concurrent.futures
10+
from urllib.parse import urlparse, urljoin
1011
from zipfile import ZipFile, ZIP_DEFLATED
1112
from random import shuffle, uniform
1213
from time import sleep
1314
from copy import deepcopy
14-
from fpdf import FPDF
15-
from PIL import Image
16-
from PyPDF2 import PdfFileMerger
15+
#from fpdf import FPDF
16+
#from PIL import Image
17+
#from PyPDF2 import PdfFileMerger
1718

1819

1920
class Comic:
21+
"""Comic class. Contains chapters."""
22+
2023
def __init__(self, comic_url, program_args):
24+
"""Init function. Creates chapters for the given comic."""
2125
self.url = comic_url
2226
self.name = comic_url.split('/')[-1] \
2327
if comic_url.split('/')[-1] else comic_url.split('/')[-2]
@@ -36,6 +40,7 @@ def __init__(self, comic_url, program_args):
3640
self.all_chapters = self.get_chapters()
3741

3842
def get_chapters(self):
43+
"""Get list of chapters."""
3944
if 'mangafox' in self.url:
4045
self.mode = ['manga', 'mangafox']
4146
chapters = self.manga_extract_chapters()
@@ -52,6 +57,7 @@ def get_chapters(self):
5257
return chapters
5358

5459
def set_download_chapters(self, potential_keys=None):
60+
"""Set chapters to download."""
5561
if potential_keys:
5662
keys = list(set(potential_keys) & set(self.all_chapters.keys()))
5763
else:
@@ -67,6 +73,7 @@ def set_download_chapters(self, potential_keys=None):
6773
print(sorted(keys))
6874

6975
def download_comic(self):
76+
"""Begin download the chapters in the comic."""
7077
with concurrent.futures.ThreadPoolExecutor(
7178
max_workers=self.chapter_threads) as executor:
7279
future_to_chapter = {
@@ -84,8 +91,12 @@ def download_comic(self):
8491
print('Downloaded: Chapter-%g' % (chapter_num))
8592

8693
def manga_extract_chapters(self):
94+
"""Extract chapters if the comic is a manga."""
8795
comic_name = self.name
8896
url = self.url
97+
urlscheme = urlparse(url)
98+
99+
# Get chapters
89100
r = requests.get(url)
90101
soup = bsoup.BeautifulSoup(r.text, 'html.parser')
91102

@@ -97,7 +108,8 @@ def manga_extract_chapters(self):
97108
('manga' in link.get('href'))]
98109

99110
for link in links:
100-
chapter_link = '/'.join(link.split('/')[:-1])
111+
chapter_link = urljoin(urlscheme.scheme + "://" + urlscheme.netloc,
112+
'/'.join(link.split('/')[:-1]))
101113
matched_groups = re.search('v(\d*)/c([\d \.]*)', chapter_link)
102114
if matched_groups:
103115
volume_num = int(matched_groups.group(1))
@@ -110,6 +122,7 @@ def manga_extract_chapters(self):
110122
return chapters
111123

112124
def comic_extract_chapters(self):
125+
"""Extract chapters if it is a comic."""
113126
url = self.url
114127
comic = url.split('/')[-1]
115128
r = requests.get(url)
@@ -133,8 +146,11 @@ def comic_extract_chapters(self):
133146

134147

135148
class Chapter:
149+
"""Chapter class. Contains pages."""
150+
136151
def __init__(self, comic, chapter_num, volume_num, chapter_url):
137-
# Extract necessay information from the comic object
152+
"""Initialize constants required for download."""
153+
# Extract necessary information from the comic object
138154
self.comic_name = comic.name
139155
self.comic_download_location = comic.download_location
140156
self.comic_mode = comic.mode
@@ -149,7 +165,7 @@ def __init__(self, comic, chapter_num, volume_num, chapter_url):
149165
self.comic_file_format = comic.file_format
150166

151167
def download_chapter(self):
152-
''' Download and convert it into a cbz file '''
168+
"""Download and convert it into a cbz file."""
153169
init_status, pages, download_func = self.initialize_chapter_download()
154170

155171
if not init_status:
@@ -174,15 +190,15 @@ def download_chapter(self):
174190
chapter_name = os.path.join(
175191
self.comic_download_location, '%s-%g.cbz'
176192
% (self.comic_name, self.chapter_num))
177-
193+
178194
if self.comic_file_format == 'pdf':
179195
pdfdir(self.chapter_location, chapter_name)
180196
else:
181197
zipdir(self.chapter_location, chapter_name)
182198
shutil.rmtree(self.chapter_location)
183199

184200
def initialize_chapter_download(self):
185-
''' Obtain pages and function based on the mode '''
201+
"""Obtain pages and function based on the mode."""
186202
if self.comic_mode[0] == 'manga':
187203
init_status, pages = self.manga_get_pages()
188204
func = self.manga_download_page
@@ -193,6 +209,7 @@ def initialize_chapter_download(self):
193209
return init_status, pages, func
194210

195211
def manga_get_pages(self):
212+
"""Obtain list of pages in a manga chapter."""
196213
# Get base url
197214
if (self.comic_mode[1] == 'mangafox'):
198215
base_url = self.chapter_url + '/1.html'
@@ -220,35 +237,36 @@ def manga_get_pages(self):
220237
total_pages = int(matched_groups.group(1))
221238
break
222239
# Get page urls
223-
page_urls = ["%s/%d.html" % (self.chapter_url, i+1)
240+
page_urls = ["%s/%d.html" % (self.chapter_url, i + 1)
224241
for i in range(total_pages)]
225-
page_num = [i+1 for i in range(total_pages)]
242+
page_num = [i + 1 for i in range(total_pages)]
226243
pages = list(zip(page_urls, page_num))
227244
shuffle(pages)
228245

229246
return True, pages
230247

231248
elif (max_retries > 0):
232249
# Idea from manga_downloader (which in turn was from wget)
233-
sleep(uniform(0.5*wait_retry_time, 1.5*wait_retry_time))
250+
sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
234251
max_retries -= 1
235252
else:
236253
return False, None
237254

238255
def comic_get_pages(self):
256+
"""Obtain list of pages in a comic chapter."""
239257
url = self.chapter_url
240258
r = requests.get(url)
241259
soup = bsoup.BeautifulSoup(r.text, 'html.parser')
242260
images = [image.get('src') for image in soup.find_all(
243261
'img', attrs={'class': "chapter_img"})]
244-
page_num = [i+1 for i in range(len(images))]
262+
page_num = [i + 1 for i in range(len(images))]
245263
pages = list(zip(images, page_num))
246264
shuffle(pages)
247265

248266
return True, pages
249267

250268
def manga_download_page(self, page):
251-
''' Downloads individual pages in a manga '''
269+
"""Download individual pages in a manga."""
252270
page_url, page_num = page
253271
filename = os.path.join(self.chapter_location,
254272
'%0.3d.jpg' % (page_num))
@@ -266,7 +284,7 @@ def manga_download_page(self, page):
266284
return True
267285
elif (max_retries > 0):
268286
# Idea from manga_downloader (which in turn was from wget)
269-
sleep(uniform(0.5*wait_retry_time, 1.5*wait_retry_time))
287+
sleep(uniform(0.5 * wait_retry_time, 1.5 * wait_retry_time))
270288
max_retries -= 1
271289
else:
272290
print("Failed download: Chapter-%g, page-%d"
@@ -278,7 +296,7 @@ def manga_download_page(self, page):
278296
return False
279297

280298
def comic_download_page(self, page):
281-
''' Downloads individual pages in a manga '''
299+
"""Download individual pages in a comic."""
282300
image, page_num = page
283301
filename = os.path.join(self.chapter_location,
284302
'%0.3d.jpg' % (page_num))
@@ -288,13 +306,15 @@ def comic_download_page(self, page):
288306

289307

290308
def download_image(url, filename):
309+
"""Download image (url) and save (filename)."""
291310
response = requests.get(url, stream=True)
292311
with open(filename, 'wb') as out_file:
293312
shutil.copyfileobj(response.raw, out_file)
294313
del response
295314

296315

297316
def zipdir(folder, filename):
317+
"""Zip folder."""
298318
assert os.path.isdir(folder)
299319
zipf = ZipFile(filename, 'w', ZIP_DEFLATED)
300320
for root, dirs, files in os.walk(folder):
@@ -305,42 +325,44 @@ def zipdir(folder, filename):
305325
os.path.relpath(os.path.join(root, fn), folder))
306326
zipf.close()
307327

328+
308329
def pdfdir(folder, filename):
330+
"""Create PDF of images in the folder."""
309331
assert os.path.isdir(folder)
310332
for root, dirs, files in os.walk(folder):
311333
pass
312-
334+
313335
for fn in files:
314-
im=Image.open(folder + os.sep + fn)
336+
im = Image.open(folder + os.sep + fn)
315337
width, height = im.size
316-
pdf = FPDF(unit = "pt", format = [width, height])
338+
pdf = FPDF(unit="pt", format=[width, height])
317339
pdf.add_page()
318340
pdf.image(folder + os.sep + fn, 0, 0)
319341
pdf.output(folder + os.sep + fn.rsplit('.', 1)[0] + '.pdf', 'F')
320342

321343
merger = PdfFileMerger()
322-
for fn in files:
323-
merger.append(open(folder + os.sep + fn.rsplit('.', 1)[0] + '.pdf', 'rb'))
324-
325-
merge_file = open(filename.rsplit('.', 1)[0] + '.pdf','wb')
344+
for fn in files:
345+
merger.append(
346+
open(folder + os.sep + fn.rsplit('.', 1)[0] + '.pdf', 'rb'))
347+
348+
merge_file = open(filename.rsplit('.', 1)[0] + '.pdf', 'wb')
326349
merger.write(merge_file)
327-
328-
350+
329351

330352
# cover = Image.open(folder + os.sep + fn)
331353
# width, height = cover.size
332354
# pdf = FPDF(unit = "pt", format = [width, height])
333355
# pdf.add_page()
334356
# pdf.image(folder + os.sep + fn, 0, 0)
335357
# pdf.output(folder + os.sep + fn.rsplit('.', 1)[0] + '.pdf', 'F')
336-
#
358+
#
337359
# merger = PdfFileMerger()
338360
# for fn in files:
339361
# merger.append(open(folder + os.sep + fn.rsplit('.', 1)[0] + '.pdf', 'rb'))
340362
# merger.write(filename.rsplit('.', 1)[0] + '.pdf')
341363

342364
def main():
343-
# parse input
365+
"""Parse input and download comic(s)."""
344366
parser = argparse.ArgumentParser(
345367
description=(
346368
'Downloads all manga chapters from'
@@ -367,11 +389,11 @@ def main():
367389
"-wt", "--waittime", default=10,
368390
help="Wait time before retry if encountered with an error")
369391
parser.add_argument(
370-
"-rt", "--retries", default=10,
392+
"-rt", "--retries", default=30,
371393
help="Number of retries before giving up")
372394
parser.add_argument(
373395
"-f", "--format", default='cbz',
374-
help="File format of the downloaded file, supported .PDF and .CBZ")
396+
help="File format of the downloaded file, supported 'pdf' and 'cbz'")
375397

376398
args = parser.parse_args()
377399

@@ -386,7 +408,9 @@ def main():
386408
if len(start_stop) == 1:
387409
potential_keys = [float(start_stop[0])]
388410
elif len(start_stop) == 2:
389-
potential_keys = [i*0.5 for i in range(2*int(start_stop[0]), 2*int(start_stop[1])+1)]
411+
potential_keys = [
412+
i * 0.5 for i in range(2 * int(start_stop[0]),
413+
2 * int(start_stop[1]) + 1)]
390414
else:
391415
raise SyntaxError(
392416
"Chapter inputs should be separated by ':'")
@@ -401,7 +425,6 @@ def main():
401425
comic.download_comic()
402426
print('Downloaded comic:' + url.split('/')[-1])
403427

404-
405-
428+
406429
if __name__ == '__main__':
407430
main()

requirements.txt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
beautifulsoup4
2-
futures
3-
requests
4-
numpy
1+
beautifulsoup4==4.6.0
2+
certifi==2017.7.27.1
3+
chardet==3.0.4
4+
futures==3.1.1
5+
idna==2.6
6+
requests==2.18.4
7+
urllib3==1.22

0 commit comments

Comments (0)