
Commit 08385f6

Author: Germey
Commit message: catch retry error
1 parent f7cf600 · commit 08385f6

9 files changed: +42 −55 lines


proxypool/crawlers/base.py

Lines changed: 19 additions & 9 deletions
@@ -1,4 +1,4 @@
-from retrying import retry
+from retrying import RetryError, retry
 import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
@@ -23,15 +23,25 @@ def fetch(self, url, **kwargs):
         except requests.ConnectionError:
             return
 
-    @logger.catch
+    def process(self, html, url):
+        """
+        used for parse html
+        """
+        for proxy in self.parse(html):
+            logger.info(f'fetched proxy {proxy.string()} from {url}')
+            yield proxy
+
     def crawl(self):
         """
         crawl main method
         """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url)
-            time.sleep(.5)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
+        try:
+            for url in self.urls:
+                logger.info(f'fetching {url}')
+                html = self.fetch(url)
+                time.sleep(.5)
+                yield from self.process(html, url)
+        except RetryError:
+            logger.error(
+                f'crawler {self} crawled proxy unsuccessfully, '
+                'please check if target url is valid or network issue')
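For context on the new except clause: RetryError is what the retrying library raises once a @retry-decorated callable has exhausted its attempts. A minimal, self-contained sketch of that behaviour follows; the stop and retry settings are illustrative, not necessarily the ones BaseCrawler.fetch actually uses.

    from retrying import RetryError, retry

    # hypothetical stand-in for a fetch that never succeeds: it always returns
    # None, so retry_on_result keeps requesting another attempt until the limit
    @retry(stop_max_attempt_number=3, retry_on_result=lambda result: result is None)
    def fetch(url):
        return None

    try:
        fetch('http://www.example.com/free-proxies')
    except RetryError as error:
        # attempts exhausted: retrying raises RetryError instead of returning
        print(f'gave up: {error}')

With crawl() wrapping its loop in try/except RetryError, a source that keeps failing is logged and the crawler stops cleanly instead of propagating the error to the caller.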

proxypool/crawlers/public/data5u.py

Lines changed: 0 additions & 16 deletions
@@ -11,23 +11,7 @@ class Data5UCrawler(BaseCrawler):
     data5u crawler, http://www.data5u.com
     """
     urls = [BASE_URL]
-
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-
     def parse(self, html):
         """
         parse html file to get proxies
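With crawl() and process() now implemented once in BaseCrawler, a crawler like this only needs to declare its urls and implement parse(). A minimal hypothetical subclass illustrating that contract (the URL and the host:port line format are invented for the sketch):

    from proxypool.crawlers.base import BaseCrawler
    from proxypool.schemas.proxy import Proxy


    class ExampleCrawler(BaseCrawler):
        # hypothetical source that returns one "host:port" pair per line
        urls = ['http://www.example.com/proxies.txt']

        def parse(self, html):
            for line in (html or '').splitlines():
                host, _, port = line.partition(':')
                if host and port.isdigit():
                    yield Proxy(host=host, port=int(port))

The crawl() override deleted above (and the matching one in xicidaili.py below) was exactly this kind of duplication.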

proxypool/crawlers/public/ihuan.py

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ class IhuanCrawler(BaseCrawler):
     path = time.strftime("%Y/%m/%d/%H", time.localtime())
     urls = [BASE_URL.format(path=path)]
     ignore = False
+
     def parse(self, html):
         """
         parse html file to get proxies

proxypool/crawlers/public/jiangxianli.py

Lines changed: 8 additions & 4 deletions
@@ -1,23 +1,27 @@
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
-import re
 import json
+
+
 BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
 
 MAX_PAGE = 10
+
+
 class JiangxianliCrawler(BaseCrawler):
     """
     jiangxianli crawler,https://ip.jiangxianli.com/
     """
+
     urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
-
+
     def parse(self, html):
         """
         parse html file to get proxies
         :return:
         """
-
-        result =json.loads(html)
+
+        result = json.loads(html)
         if result['code'] != 0:
             return
         MAX_PAGE = int(result['data']['last_page'])

proxypool/crawlers/public/xiaoshudaili.py

Lines changed: 10 additions & 5 deletions
@@ -1,7 +1,5 @@
 import re
-
 from pyquery import PyQuery as pq
-
 from proxypool.schemas.proxy import Proxy
 from proxypool.crawlers.base import BaseCrawler
 
@@ -16,16 +14,23 @@ class XiaoShuCrawler(BaseCrawler):
     """
 
     def __init__(self):
-        html = self.fetch(url=BASE_URL)
+        """
+        init urls
+        """
+        try:
+            html = self.fetch(url=BASE_URL)
+        except:
+            self.urls = []
+            return
         doc = pq(html)
         title = doc(".title:eq(0) a").items()
-
         latest_page = 0
         for t in title:
            res = re.search(r"/(\d+)\.html", t.attr("href"))
            latest_page = int(res.group(1)) if res else 0
         if latest_page:
-            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(latest_page - MAX_PAGE, latest_page)]
+            self.urls = [PAGE_BASE_URL.format(page=page) for page in range(
+                latest_page - MAX_PAGE, latest_page)]
         else:
             self.urls = []
 
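One way to see the effect of this guard without touching the network is to stub fetch so it fails, then confirm the crawler still constructs with an empty url list. A small sketch; the monkey-patching is purely illustrative.

    from unittest import mock

    from retrying import RetryError

    from proxypool.crawlers.public.xiaoshudaili import XiaoShuCrawler

    # simulate the seed page being unreachable: fetch raises RetryError,
    # __init__ swallows it and falls back to an empty url list
    with mock.patch.object(XiaoShuCrawler, 'fetch', side_effect=RetryError(None)):
        crawler = XiaoShuCrawler()

    print(crawler.urls)           # []
    print(list(crawler.crawl()))  # [] - nothing to crawl, nothing raised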

proxypool/crawlers/public/xicidaili.py

Lines changed: 0 additions & 17 deletions
@@ -12,23 +12,7 @@ class XicidailiCrawler(BaseCrawler):
     """
     urls = [BASE_URL]
     ignore = True
-
-    headers = {
-        'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
-    }
 
-    @logger.catch
-    def crawl(self):
-        """
-        crawl main method
-        """
-        for url in self.urls:
-            logger.info(f'fetching {url}')
-            html = self.fetch(url, headers=self.headers)
-            for proxy in self.parse(html):
-                logger.info(f'fetched proxy {proxy.string()} from {url}')
-                yield proxy
-
     def parse(self, html):
         """
         parse html file to get proxies
@@ -49,4 +33,3 @@ def parse(self, html):
     crawler = XicidailiCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

proxypool/crawlers/public/zhandaye.py

Lines changed: 1 addition & 1 deletion
@@ -8,6 +8,7 @@
 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
 MAX_PAGE = 5 * 2
 
+
 class ZhandayeCrawler(BaseCrawler):
     """
     zhandaye crawler, https://www.zdaye.com/dayProxy/
@@ -56,4 +57,3 @@ def parse(self, html):
     crawler = ZhandayeCrawler()
     for proxy in crawler.crawl():
         print(proxy)
-

proxypool/processors/getter.py

Lines changed: 3 additions & 3 deletions
@@ -8,22 +8,22 @@ class Getter(object):
     """
     getter of proxypool
     """
-
+
     def __init__(self):
         """
         init db and crawlers
         """
         self.redis = RedisClient()
         self.crawlers_cls = crawlers_cls
         self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
-
+
     def is_full(self):
         """
         if proxypool if full
         return: bool
         """
         return self.redis.count() >= PROXY_NUMBER_MAX
-
+
     @logger.catch
     def run(self):
         """
