Skip to content
This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit aaca358

Browse files
committed
Class based rewrite with separate extractors for every site
1 parent bbf213d commit aaca358

File tree

6 files changed

+346
-353
lines changed

6 files changed

+346
-353
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,6 @@ ENV/
8787

8888
# Rope project settings
8989
.ropeproject
90+
91+
# Custom
92+
.idea

comic_scraper/base_comic.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
"""Base Comic class."""
2+
import os
3+
from collections import OrderedDict
4+
import concurrent.futures
5+
import shutil
6+
import requests
7+
from zipfile import ZipFile, ZIP_DEFLATED
8+
import img2pdf
9+
10+
11+
class BaseComic:
    """Base Comic class. Contains chapters.

    Subclasses provide the site-specific part by overriding
    :meth:`extract_chapters`. This class handles choosing which chapters
    to fetch and downloading them on a thread pool.
    """

    def __init__(self, comic_url, program_args, verify_https):
        """Init function. Creates chapters for the given comic.

        Args:
            comic_url: URL of the comic. Its last non-empty path segment
                becomes the comic name and the download sub-folder name.
            program_args: Object exposing the ``location``,
                ``chapterthreads``, ``pagethreads``, ``waittime``,
                ``retries`` and ``format`` attributes.
            verify_https: Stored as ``self.verify_https`` for the chapter
                objects to pick up.
        """
        self.url = comic_url
        # Strip every trailing slash so 'site/comic', 'site/comic/' and
        # 'site/comic//' all resolve to the same name.
        self.name = comic_url.rstrip('/').split('/')[-1]
        # Set download location
        self.download_location = os.path.abspath(
            os.path.join(program_args.location, self.name))
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.download_location, exist_ok=True)
        # Set threads and retry values
        self.chapter_threads = program_args.chapterthreads
        self.page_threads = program_args.pagethreads
        self.wait_time = program_args.waittime
        self.max_retries = program_args.retries
        self.file_format = program_args.format
        # Set verify mode
        self.verify_https = verify_https
        # Get all chapters and mode of download
        self.all_chapters = self.extract_chapters()

    def set_download_chapters(self, potential_keys=None):
        """Set chapters to download.

        Args:
            potential_keys: Optional iterable of chapter keys. Only keys
                that are present in ``self.all_chapters`` are kept. When
                omitted or empty, every known chapter is selected.

        The selection is stored in ``self.chapters_to_download`` as an
        ``OrderedDict`` in ascending key order and the chosen keys are
        printed.
        """
        if potential_keys:
            keys = set(potential_keys) & set(self.all_chapters)
        else:
            keys = set(self.all_chapters)

        # Sort keys to make it ascending order and make it a new dict
        ordered_keys = sorted(keys)
        self.chapters_to_download = OrderedDict(
            (key, self.all_chapters[key]) for key in ordered_keys)
        # Print downloading chapters
        print("Downloading the below chapters:")
        print(ordered_keys)

    def download_comic(self):
        """Begin download the chapters in the comic.

        Every chapter in ``self.chapters_to_download`` has its
        ``download_chapter`` method run on a pool of
        ``self.chapter_threads`` workers. A failure in one chapter is
        printed and does not stop the remaining chapters.
        """
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.chapter_threads) as executor:
            future_to_chapter = {
                executor.submit(chapter.download_chapter): chapter_num
                for chapter_num, chapter in self.chapters_to_download.items()}

            for future in concurrent.futures.as_completed(future_to_chapter):
                chapter_num = future_to_chapter[future]
                try:
                    future.result()
                except Exception as exc:
                    print('Chapter-%g generated an exception: %s'
                          % (chapter_num, exc))
                else:
                    print('Downloaded: Chapter-%g' % (chapter_num))

    def extract_chapters(self):
        """Extract chapters function (backbone).

        Subclasses override this to return a mapping of chapter number to
        chapter object (anything with a ``download_chapter`` method).

        Returns:
            An empty dict, so that ``set_download_chapters`` still works
            on a comic without a site-specific extractor.
        """
        return {}
72+
73+
74+
class BaseChapter:
    """Base Chapter class. Contains pages.

    Subclasses provide the site-specific part by overriding
    :meth:`get_pages` and :meth:`download_page`. This class handles the
    per-chapter folder, the parallel page download and the final
    conversion to a single file.
    """

    def __init__(self, comic, chapter_num, volume_num, chapter_url):
        """Initialize constants required for download.

        Args:
            comic: Comic object providing ``name``, ``download_location``,
                ``page_threads``, ``wait_time``, ``max_retries``,
                ``file_format`` and ``verify_https``.
            chapter_num: Chapter number. It is formatted with ``%g``
                later, so it has to be numeric.
            volume_num: Volume number. It is formatted with ``%d`` later.
            chapter_url: URL of the chapter.
        """
        # Extract necessary information from the comic object
        self.comic_name = comic.name
        self.comic_download_location = comic.download_location
        # Create chapter specific variables
        self.chapter_num = chapter_num
        self.volume_num = volume_num
        self.chapter_url = chapter_url
        # Threads and retry time
        self.page_threads = comic.page_threads
        self.wait_time = comic.wait_time
        self.max_retries = comic.max_retries
        self.comic_file_format = comic.file_format
        # Set verify mode
        self.verify_https = comic.verify_https
        # Get download chapter location
        self.chapter_location = os.path.join(
            self.comic_download_location, 'chapter-' + str(self.chapter_num))

    def download_chapter(self):
        """Download the chapter and convert it into a pdf or cbz file.

        Pages are fetched in parallel into ``self.chapter_location``. The
        folder is then converted according to ``self.comic_file_format``
        ('pdf' or 'cbz') and removed.

        Raises:
            RuntimeError: If ``get_pages`` reports failure.
        """
        init_status, pages = self.get_pages()

        if not init_status:
            raise RuntimeError('Unable to obtain pages in the chapter')

        # Create chapter location (if it doesn't exist); exist_ok avoids
        # the check-then-create race.
        os.makedirs(self.chapter_location, exist_ok=True)

        # Download individual pages in parallel
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.page_threads) as executor:
            future_to_page = {
                executor.submit(self.download_page, page): page
                for page in pages}

            # Collect every result so a failed page is reported instead of
            # vanishing. The chapter is still archived with the pages that
            # did arrive.
            for future in concurrent.futures.as_completed(future_to_page):
                try:
                    future.result()
                except Exception as exc:
                    print('Chapter-%g: page %r generated an exception: %s'
                          % (self.chapter_num, future_to_page[future], exc))

        # Convert the folder to a comic book zip filename
        chapter_name = os.path.join(
            self.comic_download_location, '%s-%g (v%d)'
            % (self.comic_name, self.chapter_num, self.volume_num))

        if self.comic_file_format == 'pdf':
            pdfdir(self.chapter_location, chapter_name + ".pdf")
        elif self.comic_file_format == 'cbz':
            zipdir(self.chapter_location, chapter_name + ".cbz")
        # The per-chapter folder is only a staging area; drop it.
        shutil.rmtree(self.chapter_location)

    def get_pages(self):
        """Get pages function (backbone).

        Subclasses override this.

        Returns:
            A ``(status, pages)`` pair. ``status`` is truthy on success and
            ``pages`` is an iterable whose items are each handed to
            ``download_page``. The base implementation reports failure.
        """
        return False, 0

    def download_page(self, page=None):
        """Download page (backbone).

        Subclasses override this to fetch a single page.

        Args:
            page: One item from the ``pages`` iterable returned by
                ``get_pages``. The base implementation ignores it.
        """
        pass

    def download_image(self, url, filename):
        """Download image (url) and save (filename).

        Args:
            url: Address of the image.
            filename: Path of the file to write.

        Raises:
            requests.exceptions.RequestException: On connection problems
                or when the server answers with an HTTP error status.
        """
        response = requests.get(url, stream=True, verify=self.verify_https)
        try:
            # Do not store an HTML error page under an image filename.
            response.raise_for_status()
            # Have urllib3 undo gzip/deflate transfer encoding; the raw
            # stream is otherwise copied to disk still compressed.
            response.raw.decode_content = True
            with open(filename, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        finally:
            # Streamed responses hold their connection until closed.
            response.close()
139+
140+
141+
def zipdir(folder, filename):
    """Zip folder.

    Every file below ``folder`` is stored in a deflate-compressed archive
    under its path relative to ``folder``. Empty directories are not
    stored.

    Args:
        folder: Existing directory to archive.
        filename: Path of the archive to create.
    """
    assert os.path.isdir(folder)
    # The context manager closes the archive even when a write fails.
    with ZipFile(filename, 'w', ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder):
            # Sorting in place makes os.walk visit sub-folders in a
            # stable order as well.
            dirs.sort()
            # note: ignore empty directories
            for fn in sorted(files):
                path = os.path.join(root, fn)
                zipf.write(path, os.path.relpath(path, folder))
152+
153+
154+
def pdfdir(folder, filename):
    """Create PDF of images in the folder.

    All files below ``folder`` are collected, sorted by name within each
    directory, and converted with a single ``img2pdf.convert`` call so the
    output is one PDF document.

    Args:
        folder: Existing directory holding the images.
        filename: Path of the PDF to create.
    """
    assert os.path.isdir(folder)
    images = []
    for root, dirs, files in os.walk(folder):
        # Sorting in place makes os.walk visit sub-folders in a stable
        # order as well.
        dirs.sort()
        images.extend(os.path.join(root, fn) for fn in sorted(files))
    # Convert before opening the target so a failed conversion does not
    # leave an empty file behind.
    pdf_bytes = img2pdf.convert(images)
    with open(filename, "wb") as f:
        f.write(pdf_bytes)

0 commit comments

Comments
 (0)