import asyncio
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from pyppeteer import launch

from utils.utils import eprint


class MyPage(object):
    """Simple mutable container that carries a page's rendered HTML
    between the async fetch coroutine and synchronous callers.
    """

    def __init__(self):
        self.source = None  # populated by set() after a successful fetch

    def set(self, source):
        """Record *source* as this page's HTML content."""
        self.source = source
14+
15+
async def get_page(test_page, url):
    """Render *url* in a headless Chromium instance and store the resulting
    HTML on *test_page* (an object exposing ``set(source)``, e.g. MyPage).

    :param test_page: receiver for the rendered page source
    :param url: URL to navigate to
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        # content() returns the post-JavaScript, fully rendered HTML.
        test_page.set(await page.content())
    finally:
        # Close the browser even when navigation or rendering raises, so
        # the headless Chromium process is never leaked.
        await browser.close()
23+
24+
25+ def _recursive_get_urls (crawled_urls , test_page , max_urls , parent_url , domain , depth = 0 ):
1026 if depth == 0 or len (crawled_urls ) == max_urls :
1127 return crawled_urls
12- driver .get (parent_url )
13- html = driver .page_source .encode ('utf-8' )
28+ asyncio .get_event_loop ().run_until_complete (get_page (test_page , parent_url ))
29+
30+ html = test_page .source
1431 soup = BeautifulSoup (html , features = 'html.parser' )
1532
1633 urls = soup .findAll ('a' )
@@ -23,31 +40,15 @@ def _recursive_get_urls(crawled_urls, driver, max_urls, parent_url, domain, dept
2340 if urlparse (url ).netloc == domain and url not in crawled_urls :
2441 if len (crawled_urls ) <= max_urls :
2542 crawled_urls .append (url )
26- print ('[LOG] Added: {}' .format (url ))
27- _recursive_get_urls (crawled_urls , driver , max_urls , url , domain , depth - 1 )
43+ eprint ('[LOG] Added: {}' .format (url ))
44+ _recursive_get_urls (crawled_urls , max_urls , url , domain , depth - 1 )
2845
2946
def get_recursive_urls(parent_url, max_depth, max_urls):
    """Crawl *parent_url* and return a list of same-domain URLs.

    The result always starts with *parent_url*, holds at most *max_urls*
    entries, and is gathered by following links up to *max_depth* levels
    deep.

    :param parent_url: entry-point URL; its netloc defines the crawl domain
    :param max_depth: maximum recursion depth for the crawl
    :param max_urls: maximum number of URLs to collect
    :return: list of discovered URLs, parent_url first
    """
    scraped_urls = [parent_url]
    domain = urlparse(parent_url).netloc
    page = MyPage()
    # The recursive crawler fetches parent_url itself on its first call, so
    # the former extra up-front get_page() of the same URL was redundant
    # (its result was immediately overwritten) and has been removed.
    _recursive_get_urls(scraped_urls, page, max_urls, parent_url, domain,
                        depth=max_depth)
    eprint('[LOG] Finished crawling URLs for {}'.format(parent_url))
    return scraped_urls