Skip to content
This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit 6422233

Browse files
committed
Barebones comic scraper
1 parent 7453a85 commit 6422233

File tree

1 file changed

+92
-0
lines changed

1 file changed

+92
-0
lines changed

comic-scrapper.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python3
import argparse
import concurrent.futures
import os
import shutil
import subprocess
import zipfile
from collections import defaultdict
# from pprint import pprint

import bs4 as bsoup
import requests
11+
12+
13+
def download_image(url, filename):
    """Stream the image at *url* into *filename* on disk.

    Args:
        url: Direct URL of the image to fetch.
        filename: Destination path for the raw image bytes.

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            writing an HTML error page to disk as a ".jpg".
    """
    # Context manager guarantees the connection is released even if the
    # copy fails; the original relied on `del response` with no error path.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # Ask urllib3 to transparently decompress gzip/deflate bodies;
        # reading `.raw` without this can store compressed bytes verbatim.
        response.raw.decode_content = True
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
18+
19+
20+
def readcomics_extract_chapters(url):
    """Scrape a readcomics.tv comic page for its chapter links.

    Args:
        url: Comic landing page, e.g.
            http://www.readcomics.tv/comic/spider-man-2016 — the last path
            segment is used as the comic slug to match chapter anchors.

    Returns:
        dict mapping int chapter number -> full-page chapter URL
        (the chapter link with '/full' appended).
    """
    comic = url.split('/')[-1]
    r = requests.get(url)
    soup = bsoup.BeautifulSoup(r.text, 'html.parser')

    chapters = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href returned None and crashed the `in`
        # tests in the original; skip them along with unrelated links.
        if not href or comic not in href or 'chapter' not in href:
            continue
        try:
            chapter_num = int(href.split('-')[-1])
        except ValueError:
            # Non-numeric suffixes (e.g. "chapter-12.5") are skipped
            # rather than aborting the whole scrape.
            continue
        # Keep the first link seen for each chapter number.
        chapters.setdefault(chapter_num, href + '/full')

    return chapters
36+
37+
38+
def readcomics_download_chapter(url, chapter_num):
    """Download every page image of one chapter and pack it as a .cbz.

    Creates a temporary directory ``chapter-<num>``, downloads all page
    images into it concurrently, zips the directory into
    ``chapter-<num>.cbz`` in the current working directory, then removes
    the temporary directory.

    Args:
        url: Chapter URL in '/full' (all pages on one page) form.
        chapter_num: Chapter number, used to name the folder and archive.
    """
    chapter_name = 'chapter-' + str(chapter_num)
    r = requests.get(url)
    soup = bsoup.BeautifulSoup(r.text, 'html.parser')
    images = [image.get('src') for image in soup.find_all(
        'img', attrs={'class': "chapter_img"})]
    # Zero-padded names keep pages in reading order for cbz viewers.
    filenames = [os.path.join(chapter_name, '%0.3d.jpg' % i)
                 for i in range(len(images))]

    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(chapter_name, exist_ok=True)

    # The `with` block waits for all downloads before we start zipping.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        for image, filename in zip(images, filenames):
            executor.submit(download_image, image, filename)

    # Pack with stdlib zipfile instead of shelling out to an external
    # `zip` binary, which may not be installed (a .cbz is a plain zip).
    with zipfile.ZipFile(chapter_name + '.cbz', 'w') as cbz:
        for filename in filenames:
            cbz.write(filename)
    shutil.rmtree(chapter_name)
    print(chapter_name + ': Downloaded')
61+
62+
63+
def main():
    """Parse comic URLs from the command line and download each as .cbz files.

    Only readcomics.tv URLs are acted on; other hosts are announced but
    skipped.
    """
    parser = argparse.ArgumentParser(
        description=(
            'Downloads all comics from'
            'the given url (currently works only with readcomics.tv).'
            ' Example - A url input'
            ' http://www.readcomics.tv/comic/spider-man-2016 looks '
            'for the spider-man-2016 comics in the url, downloads them all, '
            'and makes cbz files of all issues.'))

    parser.add_argument('urls', metavar='url', nargs='+',
                        help='Comic urls to download')

    args = parser.parse_args()

    for url in args.urls:
        comic = url.split('/')[-1]
        print('Downloading comic:' + comic)
        # The original tested 'readcomics.tv' in url twice in a row;
        # a single guard does the same work.
        if 'readcomics.tv' in url:
            chapters = readcomics_extract_chapters(url)
            # Sort so chapters are fetched in ascending numeric order
            # rather than link-discovery order.
            for chapter_num in sorted(chapters):
                readcomics_download_chapter(chapters[chapter_num],
                                            chapter_num)
        print('Downloaded comic:' + comic)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)