Skip to content

Commit 6b2461f

Browse files
author
Alexandru Meterez
committed
Move project to using pyppeteer instead of chromedriver
1 parent b759230 commit 6b2461f

File tree

3 files changed

+30
-30
lines changed

3 files changed

+30
-30
lines changed

fred/config/status_codes.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@
33
2: 'Prediction script failed',
44
3: 'Failed to launch inference process',
55
4: 'Model does not exist',
6-
5: 'CHROMEDRIVER_PATH not provided in env variables'
76
}

fred/crawler/crawler.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,33 @@
11
from urllib.parse import urlparse
2-
from selenium import webdriver
32
from bs4 import BeautifulSoup
4-
from selenium.common.exceptions import InvalidArgumentException
5-
import os
63
from utils.utils import eprint
4+
from pyppeteer import launch
5+
import asyncio
76

87

9-
def _recursive_get_urls(crawled_urls, driver, max_urls, parent_url, domain, depth=0):
8+
class MyPage(object):
    """Mutable holder for a fetched page's HTML source.

    An instance is handed to the async fetcher, which fills in
    ``source`` via :meth:`set`; callers then read ``source`` back on
    the synchronous side of the event-loop boundary.
    """

    def __init__(self, source=None):
        """Create a holder, optionally pre-seeded with *source*.

        Defaults to ``None`` (no page fetched yet), which keeps the
        original zero-argument construction working unchanged.
        """
        self.source = source

    def set(self, source):
        """Store the fetched page source (HTML string)."""
        self.source = source
14+
15+
16+
async def get_page(test_page, url):
    """Fetch *url* in a headless pyppeteer browser and store its HTML.

    Launches a fresh browser, navigates to *url*, and deposits the
    rendered page content into *test_page* via ``test_page.set(...)``.

    :param test_page: a ``MyPage``-like holder with a ``set(source)`` method.
    :param url: the URL to navigate to.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        out = await page.content()
        test_page.set(out)
    finally:
        # Always release the browser, even when goto()/content() raises
        # (e.g. navigation error) — otherwise each failed fetch leaks a
        # headless Chromium process.
        await browser.close()
23+
24+
25+
def _recursive_get_urls(crawled_urls, test_page, max_urls, parent_url, domain, depth=0):
1026
if depth == 0 or len(crawled_urls) == max_urls:
1127
return crawled_urls
12-
driver.get(parent_url)
13-
html = driver.page_source.encode('utf-8')
28+
asyncio.get_event_loop().run_until_complete(get_page(test_page, parent_url))
29+
30+
html = test_page.source
1431
soup = BeautifulSoup(html, features='html.parser')
1532

1633
urls = soup.findAll('a')
@@ -23,31 +40,15 @@ def _recursive_get_urls(crawled_urls, driver, max_urls, parent_url, domain, dept
2340
if urlparse(url).netloc == domain and url not in crawled_urls:
2441
if len(crawled_urls) <= max_urls:
2542
crawled_urls.append(url)
26-
print('[LOG] Added: {}'.format(url))
27-
_recursive_get_urls(crawled_urls, driver, max_urls, url, domain, depth - 1)
43+
eprint('[LOG] Added: {}'.format(url))
44+
_recursive_get_urls(crawled_urls, max_urls, url, domain, depth - 1)
2845

2946

3047
def get_recursive_urls(parent_url, max_depth, max_urls):
    """Crawl *parent_url* and return up to *max_urls* same-domain URLs.

    :param parent_url: the starting URL; always included in the result.
    :param max_depth: how many link levels to follow recursively.
    :param max_urls: upper bound on the number of URLs collected.
    :returns: list of URLs, beginning with *parent_url*.
    """
    scraped_urls = [parent_url]
    domain = urlparse(parent_url).netloc
    page = MyPage()
    # No pre-fetch of parent_url here: _recursive_get_urls fetches it
    # itself on its first call, so fetching it up front was a redundant
    # second page load. NOTE(review): the old chromedriver version used
    # that initial fetch to reject invalid URLs (exit code 1); that
    # validation was dropped in the pyppeteer port — confirm whether it
    # should be reinstated around the first fetch.
    _recursive_get_urls(scraped_urls, page, max_urls, parent_url, domain, depth=max_depth)
    eprint('[LOG] Finished crawling URLs for {}'.format(parent_url))
    return scraped_urls

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
selenium==3.141.0
21
torchsummary==1.5.1
32
torchvision==0.4.0
43
APScheduler==3.6.1
@@ -9,4 +8,5 @@ Flask_Cors==3.0.8
98
numpy==1.17.1
109
Pillow==6.1.0
1110
browsermob_proxy==0.8.0
12-
beautifulsoup4==4.8.0
11+
beautifulsoup4==4.8.0
12+
pyppeteer==0.0.25

0 commit comments

Comments
 (0)