Skip to content
This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit 89e7393

Browse files
committed
Added extra arguments for threads and wait time.
1 parent 3703c32 commit 89e7393

File tree

4 files changed

+64
-24
lines changed

4 files changed

+64
-24
lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2016 Dinesh Natesan
3+
Copyright (c) 2017 Dinesh Natesan
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# Comic-scraper (Comic Downloader)
2-
Downloads comics from various websites and creates cbz files from them.
3-
Currently supports just readcomics.tv
1+
# Comic-scraper (Comic/Manga Downloader)
2+
Downloads comics and manga from various websites and creates cbz files from them.
3+
Currently supports readcomics.tv, mangafox.me and mangahere.co
44

55
## Installation
66

@@ -13,7 +13,7 @@ pip install comic-scraper
1313
### Via pip (local)
1414
Clone a copy of the repository using the following command:
1515
```
16-
git clone git@github.com:AbstractGeek/comic-scraper.git
16+
git clone https://github.com/AbstractGeek/comic-scraper.git
1717
```
1818

1919
Open your terminal into the folder and type this into it (sudo might be necessary):
@@ -33,9 +33,10 @@ These can simply be installed by:
3333
```
3434
pip install -r requirements.txt
3535
```
36-
That's it. Use comic_scraper.py to download comics
36+
That's it. Use comic_scraper.py to download comics and manga.
3737

3838
## Usage
39+
### Comics
3940
Find your comic of interest in readcomics.tv. Copy the url of the comic page.
4041
For example, If I wanted to download spider-man-2016, I need to copy this url:
4142
http://www.readcomics.tv/comic/spider-man-2016
@@ -56,3 +57,16 @@ For example, if I want to download chapters 10-20, I use the following command
5657
comic-scraper -l ~/Comics/ -c 10:20 http://www.readcomics.tv/comic/spider-man-2016
5758
```
5859
Note: Only individual chapters or sequential chunks (start:stop) can currently be downloaded.
60+
61+
### Manga
62+
The syntax for downloading manga is exactly the same as for comics. For example, if I wanted to download the Kingdom manga, I would copy the url from the mangafox (or mangahere) website: http://mangafox.me/manga/kingdom/
63+
64+
To download all chapters of this manga, simply call the script and input the url.
65+
```
66+
comic-scraper http://mangafox.me/manga/kingdom/
67+
```
68+
69+
To download selected chapters, add -c and input the chapter numbers. To set a custom location, add -l and input the location. Here is an example:
70+
```
71+
comic-scraper -l ~/Comics/ -c 1:100 http://mangafox.me/manga/kingdom/
72+
```

comic_scraper/comic_scraper.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,37 @@
1111
from random import shuffle, uniform
1212
from numpy import arange
1313
from time import sleep
14+
from copy import deepcopy
1415

1516

1617
class Comic:
17-
def __init__(self, comic_url, root_dir):
18+
def __init__(self, comic_url, program_args):
1819
self.url = comic_url
1920
self.name = comic_url.split('/')[-1] \
2021
if comic_url.split('/')[-1] else comic_url.split('/')[-2]
2122
# Set download location
2223
self.download_location = os.path.abspath(
23-
os.path.join(root_dir, self.name))
24+
os.path.join(program_args.location, self.name))
2425
if not os.path.exists(self.download_location):
2526
os.makedirs(self.download_location)
27+
# Set threads and retry values
28+
self.chapter_threads = program_args.chapterthreads
29+
self.page_threads = program_args.pagethreads
30+
self.wait_time = program_args.waittime
31+
self.max_retries = program_args.retries
2632
# Get all chapters and mode of download
2733
self.all_chapters = self.get_chapters()
2834

2935
def get_chapters(self):
3036
if 'mangafox' in self.url:
3137
self.mode = ['manga', 'mangafox']
32-
chapters = self.manga_extract_chapters(self.url)
38+
chapters = self.manga_extract_chapters()
3339
elif 'mangahere' in self.url:
3440
self.mode = ['manga', 'mangahere']
35-
chapters = self.manga_extract_chapters(self.url)
41+
chapters = self.manga_extract_chapters()
3642
elif 'readcomics' in self.url:
3743
self.mode = ['comic']
38-
chapters = self.comic_extract_chapters(self.url)
44+
chapters = self.comic_extract_chapters()
3945
else:
4046
raise ValueError('The scraper currently only supports mangafox, ',
4147
'mangahere and readcomics.tv ',
@@ -55,10 +61,11 @@ def set_download_chapters(self, potential_keys=None):
5561
sorted(unsorted_chapters.items(), key=lambda t: t[0]))
5662
# Print downloading chapters
5763
print("Downloading the below chapters:")
58-
print(keys)
64+
print(sorted(keys))
5965

6066
def download_comic(self):
61-
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
67+
with concurrent.futures.ThreadPoolExecutor(
68+
max_workers=self.chapter_threads) as executor:
6269
future_to_chapter = {
6370
executor.submit(chapter.download_chapter): chapter_num
6471
for chapter_num, chapter in self.chapters_to_download.items()}
@@ -73,8 +80,9 @@ def download_comic(self):
7380
else:
7481
print('Downloaded: Chapter-%g' % (chapter_num))
7582

76-
def manga_extract_chapters(self, url):
83+
def manga_extract_chapters(self):
7784
comic_name = self.name
85+
url = self.url
7886
r = requests.get(url)
7987
soup = bsoup.BeautifulSoup(r.text, 'html.parser')
8088

@@ -98,7 +106,8 @@ def manga_extract_chapters(self, url):
98106
self, chapter_num, volume_num, chapter_link)
99107
return chapters
100108

101-
def comic_extract_chapters(self, url):
109+
def comic_extract_chapters(self):
110+
url = self.url
102111
comic = url.split('/')[-1]
103112
r = requests.get(url)
104113
soup = bsoup.BeautifulSoup(r.text, 'html.parser')
@@ -130,6 +139,10 @@ def __init__(self, comic, chapter_num, volume_num, chapter_url):
130139
self.chapter_num = chapter_num
131140
self.volume_num = volume_num
132141
self.chapter_url = chapter_url
142+
# Threads and retry time
143+
self.page_threads = comic.page_threads
144+
self.wait_time = comic.wait_time
145+
self.max_retries = comic.max_retries
133146

134147
def download_chapter(self):
135148
''' Download and convert it into a cbz file '''
@@ -144,7 +157,8 @@ def download_chapter(self):
144157
os.makedirs(self.chapter_location)
145158

146159
# Download individual pages in parallel
147-
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
160+
with concurrent.futures.ThreadPoolExecutor(
161+
max_workers=self.page_threads) as executor:
148162
executor.map(download_func, pages)
149163

150164
# Convert the folder to a comic book zip filename
@@ -178,8 +192,8 @@ def manga_get_pages(self):
178192
elif (self.comic_mode[1] == 'mangahere'):
179193
base_url = self.chapter_url
180194

181-
max_retries = 5
182-
wait_retry_time = 5
195+
max_retries = deepcopy(self.max_retries)
196+
wait_retry_time = deepcopy(self.wait_time)
183197

184198
while True:
185199
# Get javascript blocks
@@ -232,8 +246,8 @@ def manga_download_page(self, page):
232246
filename = os.path.join(self.chapter_location,
233247
'%0.3d.jpg' % (page_num))
234248

235-
max_retries = 10
236-
wait_retry_time = 10
249+
max_retries = deepcopy(self.max_retries)
250+
wait_retry_time = deepcopy(self.wait_time)
237251

238252
while True:
239253
r = requests.get(page_url)
@@ -303,11 +317,23 @@ def main():
303317
parser.add_argument(
304318
"-c", "--chapters", default=False,
305319
help="Specify chapters to download separated by : (10:20).")
320+
parser.add_argument(
321+
"-ct", "--chapterthreads", default=5,
322+
help="Number of parallel chapters downloads.")
323+
parser.add_argument(
324+
"-pt", "--pagethreads", default=10,
325+
help="Number of parallel chapter pages downloads (per chapter).")
326+
parser.add_argument(
327+
"-wt", "--waittime", default=10,
328+
help="Wait time before retry if encountered with an error")
329+
parser.add_argument(
330+
"-rt", "--retries", default=10,
331+
help="Number of retries before giving up")
306332

307333
args = parser.parse_args()
308334

309335
for url in args.urls:
310-
comic = Comic(url, args.location)
336+
comic = Comic(url, args)
311337
print('Downloading comic: ' + comic.name)
312338

313339
# Get chapters to download

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from setuptools import setup
22

33
setup(name='comic-scraper',
4-
version='0.1',
4+
version='0.5',
55
description='Scraps comics and creates cbz files',
66
url='https://github.com/AbstractGeek/comic-scraper',
7-
download_url='https://github.com/AbstractGeek/comic-scraper/tarball/0.1',
7+
download_url='https://github.com/AbstractGeek/comic-scraper/tarball/0.5',
88
author='Dinesh Natesan',
99
author_email='[email protected]',
1010
license='MIT',
@@ -14,7 +14,7 @@
1414
'Programming Language :: Python :: 3.5',
1515
'Topic :: Games/Entertainment',
1616
],
17-
keywords='comics scraper',
17+
keywords='comics manga scraper',
1818
packages=['comic_scraper'],
1919
install_requires=[
2020
'beautifulsoup4',

0 commit comments

Comments
 (0)