
Commit 22b87f9 (parent 8cc2d19)

Added https support and fixed pdf page-ordering issues.

1 file changed: +32 -13 lines

comic_scraper/comic_scraper.py

@@ -8,6 +8,7 @@
 import re
 import concurrent.futures
 from urllib.parse import urlparse, urljoin
+from urllib3.exceptions import InsecureRequestWarning
 from zipfile import ZipFile, ZIP_DEFLATED
 from random import shuffle, uniform
 from time import sleep
@@ -18,7 +19,7 @@
 class Comic:
     """Comic class. Contains chapters."""

-    def __init__(self, comic_url, program_args):
+    def __init__(self, comic_url, program_args, verify_https):
         """Init function. Creates chapters for the given comic."""
         self.url = comic_url
         self.name = comic_url.split('/')[-1] \
@@ -34,6 +35,8 @@ def __init__(self, comic_url, program_args):
         self.wait_time = program_args.waittime
         self.max_retries = program_args.retries
         self.file_format = program_args.format
+        # Set verify mode
+        self.verify_https = verify_https
         # Get all chapters and mode of download
         self.all_chapters = self.get_chapters()
@@ -95,7 +98,7 @@ def manga_extract_chapters(self):
         urlscheme = urlparse(url)

         # Get chapters
-        r = requests.get(url)
+        r = requests.get(url, verify=self.verify_https)
         soup = bsoup.BeautifulSoup(r.text, 'html.parser')

         chapters = defaultdict(Chapter)
@@ -123,7 +126,7 @@ def comic_extract_chapters(self):
         """Extract chapters if it is a comic."""
         url = self.url
         comic = url.split('/')[-1]
-        r = requests.get(url)
+        r = requests.get(url, verify=self.verify_https)
         soup = bsoup.BeautifulSoup(r.text, 'html.parser')
         volume_num = 1
@@ -161,6 +164,8 @@ def __init__(self, comic, chapter_num, volume_num, chapter_url):
         self.wait_time = comic.wait_time
         self.max_retries = comic.max_retries
         self.comic_file_format = comic.file_format
+        # Set verify mode
+        self.verify_https = comic.verify_https

     def download_chapter(self):
         """Download and convert it into a cbz file."""
@@ -219,7 +224,7 @@ def manga_get_pages(self):

         while True:
             # Get javascript blocks
-            r = requests.get(base_url)
+            r = requests.get(base_url, verify=self.verify_https)
             soup = bsoup.BeautifulSoup(r.text, 'html.parser')
             scripts = [script for script in soup.find_all(
                 'script', attrs={'type': 'text/javascript'})]
@@ -253,7 +258,7 @@ def manga_get_pages(self):
     def comic_get_pages(self):
         """Obtain list of pages in a comic chapter."""
         url = self.chapter_url
-        r = requests.get(url)
+        r = requests.get(url, verify=self.verify_https)
         soup = bsoup.BeautifulSoup(r.text, 'html.parser')
         images = [image.get('src') for image in soup.find_all(
             'img', attrs={'class': "chapter_img"})]
@@ -273,12 +278,12 @@ def manga_download_page(self, page):
         wait_retry_time = deepcopy(self.wait_time)

         while True:
-            r = requests.get(page_url)
+            r = requests.get(page_url, verify=self.verify_https)
             soup = bsoup.BeautifulSoup(r.text, 'html.parser')
             img = soup.find_all('img', attrs={'id': 'image'})
             if img:
                 image = img[0].get('src')
-                download_image(image, filename)
+                download_image(image, filename, self.verify_https)
                 return True
             elif (max_retries > 0):
                 # Idea from manga_downloader (which in turn was from wget)
@@ -303,9 +308,9 @@ def comic_download_page(self, page):
         return True


-def download_image(url, filename):
+def download_image(url, filename, verify_https):
     """Download image (url) and save (filename)."""
-    response = requests.get(url, stream=True)
+    response = requests.get(url, stream=True, verify=verify_https)
     with open(filename, 'wb') as out_file:
         shutil.copyfileobj(response.raw, out_file)
     del response
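Note on the streamed download above: a minimal standalone sketch of the same pattern, assuming only requests and the standard library. The timeout, raise_for_status() and decode_content lines are robustness additions, not part of this commit:

    import shutil
    import requests

    def fetch_image(url, filename, verify_https=True):
        # Stream so the image is copied to disk without buffering it whole.
        # verify=False skips TLS certificate validation for this request.
        response = requests.get(url, stream=True, verify=verify_https,
                                timeout=30)
        response.raise_for_status()  # fail fast on HTTP errors
        # Let urllib3 undo gzip/deflate before the bytes hit the file;
        # response.raw is otherwise the undecoded wire stream.
        response.raw.decode_content = True
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)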
@@ -317,7 +322,7 @@ def zipdir(folder, filename):
     zipf = ZipFile(filename, 'w', ZIP_DEFLATED)
     for root, dirs, files in os.walk(folder):
         # note: ignore empty directories
-        for fn in files:
+        for fn in sorted(files):
             zipf.write(
                 os.path.join(root, fn),
                 os.path.relpath(os.path.join(root, fn), folder))
@@ -331,7 +336,7 @@ def pdfdir(folder, filename):
     for root, dirs, files in os.walk(folder):
         # Convert images to pdf
         f.write(img2pdf.convert(
-            [os.path.join(root, fn) for fn in files]))
+            [os.path.join(root, fn) for fn in sorted(files)]))


 def main():
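The pdf fix is the sorted(files) call in zipdir and pdfdir: os.walk gives no ordering guarantee, so page images could be zipped or concatenated out of sequence. A small illustration, with hypothetical page names:

    import os

    # os.walk may yield e.g. ['page_003.jpg', 'page_001.jpg', 'page_002.jpg']
    for root, dirs, files in os.walk('chapter_001'):
        pages = sorted(files)
        # pages == ['page_001.jpg', 'page_002.jpg', 'page_003.jpg']
        print([os.path.join(root, fn) for fn in pages])

Note that a plain lexicographic sort keeps pages in order only while page numbers are zero-padded ('page_10.jpg' sorts before 'page_2.jpg' without padding).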
@@ -353,7 +358,7 @@ def main():
         "-c", "--chapters", default=False,
         help="Specify chapters to download separated by : (10:20).")
     parser.add_argument(
-        "-ct", "--chapterthreads", default=2,
+        "-ct", "--chapterthreads", default=5,
         help="Number of parallel chapters downloads.")
     parser.add_argument(
         "-pt", "--pagethreads", default=10,
@@ -371,7 +376,21 @@ def main():
     args = parser.parse_args()

     for url in args.urls:
-        comic = Comic(url, args)
+        # If https, check before using verify False
+        urlscheme = urlparse(url)
+        verify_https = False
+        if urlscheme.scheme == 'https':
+            try:
+                requests.get(url)
+                verify_https = True
+            except requests.exceptions.SSLError:
+                verify_https = False
+                print('Could not validate https certificate for url:' +
+                      '%s. Proceeding with Insecure certificate.' % (url))
+                requests.packages.urllib3.disable_warnings(
+                    category=InsecureRequestWarning)
+
+        comic = Comic(url, args, verify_https)
         print('Downloading comic: ' + comic.name)

         # Get chapters to download
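The probe above is the core of the https support: one verified GET per URL decides whether every later request runs with verify=True or falls back to verify=False with warnings silenced. The same pattern as a standalone helper, for reference (the function name is illustrative):

    import requests
    from urllib3.exceptions import InsecureRequestWarning

    def https_verifiable(url):
        """Return True if url's certificate validates, else False."""
        try:
            requests.get(url)  # verify=True is the requests default
            return True
        except requests.exceptions.SSLError:
            # Every later verify=False call would print a warning;
            # silence them once here.
            requests.packages.urllib3.disable_warnings(
                category=InsecureRequestWarning)
            return False

One trade-off of this approach: the probe fetches each https comic URL once just to test the certificate, so those pages are requested twice before scraping starts.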
