Skip to content

Commit 6b695bf

Browse files
Merge pull request #2 from adobe/dev.pyppeteer-usage
Dev.pyppeteer usage
2 parents 268c844 + 6b2461f commit 6b695bf

23 files changed

+83
-956
lines changed

fred/.gitignore.swp

-12 KB
Binary file not shown.

fred/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from flask import Flask

# Single shared application object for the package; other modules import
# `app` from here and register their routes/resources on it.
app = Flask(__name__, static_url_path='')

fred/config/status_codes.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@
33
2: 'Prediction script failed',
44
3: 'Failed to launch inference process',
55
4: 'Model does not exist',
6-
5: 'CHROMEDRIVER_PATH not provided in env variables'
76
}

fred/crawler/crawler.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,33 @@
11
from urllib.parse import urlparse
2-
from selenium import webdriver
32
from bs4 import BeautifulSoup
4-
from selenium.common.exceptions import InvalidArgumentException
5-
import os
63
from utils.utils import eprint
4+
from pyppeteer import launch
5+
import asyncio
76

87

9-
def _recursive_get_urls(crawled_urls, driver, max_urls, parent_url, domain, depth=0):
8+
class MyPage(object):
    """Mutable holder for HTML produced by an async page fetch.

    The pyppeteer fetch runs inside a coroutine; this object lets the
    coroutine hand the rendered page source back to synchronous callers.
    """

    def __init__(self):
        # Nothing fetched yet.
        self.source = None

    def set(self, source):
        """Store the rendered page HTML."""
        self.source = source
14+
15+
16+
async def get_page(test_page, url):
    """Render *url* in headless chromium and store the final HTML on *test_page*.

    :param test_page: MyPage-like object with a ``set(source)`` method.
    :param url: address to navigate to.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        out = await page.content()
        test_page.set(out)
    finally:
        # Always close the browser, even when navigation or rendering
        # raises — otherwise each failed fetch leaks a chromium process.
        await browser.close()
23+
24+
25+
def _recursive_get_urls(crawled_urls, test_page, max_urls, parent_url, domain, depth=0):
1026
if depth == 0 or len(crawled_urls) == max_urls:
1127
return crawled_urls
12-
driver.get(parent_url)
13-
html = driver.page_source.encode('utf-8')
28+
asyncio.get_event_loop().run_until_complete(get_page(test_page, parent_url))
29+
30+
html = test_page.source
1431
soup = BeautifulSoup(html, features='html.parser')
1532

1633
urls = soup.findAll('a')
@@ -23,31 +40,15 @@ def _recursive_get_urls(crawled_urls, driver, max_urls, parent_url, domain, dept
2340
if urlparse(url).netloc == domain and url not in crawled_urls:
2441
if len(crawled_urls) <= max_urls:
2542
crawled_urls.append(url)
26-
print('[LOG] Added: {}'.format(url))
27-
_recursive_get_urls(crawled_urls, driver, max_urls, url, domain, depth - 1)
43+
eprint('[LOG] Added: {}'.format(url))
44+
_recursive_get_urls(crawled_urls, max_urls, url, domain, depth - 1)
2845

2946

3047
def get_recursive_urls(parent_url, max_depth, max_urls):
    """Crawl same-domain links reachable from *parent_url*.

    Follows links up to *max_depth* levels deep, collecting at most
    *max_urls* URLs; the returned list always starts with *parent_url*.
    """
    domain = urlparse(parent_url).netloc
    scraped_urls = [parent_url]
    page = MyPage()
    # NOTE(review): _recursive_get_urls fetches parent_url again itself,
    # so this initial fetch mainly surfaces navigation errors early —
    # confirm whether it is intentional.
    asyncio.get_event_loop().run_until_complete(get_page(page, parent_url))
    _recursive_get_urls(scraped_urls, page, max_urls, parent_url, domain, depth=max_depth)
    eprint('[LOG] Finished crawling URLs for {}'.format(parent_url))
    return scraped_urls

fred/data/collect.py

Lines changed: 28 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,34 @@
11
import os
2-
from selenium import webdriver
3-
from selenium.webdriver import DesiredCapabilities
4-
import time
2+
import asyncio
53
import json
6-
import base64
7-
from run import proxy
4+
from pyppeteer import launch
85

9-
SCRIPT = """var body = document.body,
10-
html = document.documentElement;
11-
return Math.max( body.scrollHeight, body.offsetHeight,
12-
html.clientHeight, html.scrollHeight, html.offsetHeight );"""
6+
# Module-level accumulators shared by the page-event callbacks below:
# responses observed on the network and text emitted by the page
# (console output, request failures, page errors).
network = []
javascript = []


async def intercept_network_response(response):
    """Append a response to the network log as '<status><url>'."""
    network.append(f'{response.status}{response.url}')


async def intercept_console(response):
    """Append an event's text to the JavaScript log."""
    javascript.append(response.text)
3716

38-
return base64.b64decode(screenshot['data'])
17+
18+
async def collect_msgs_and_screenshot(url, ss_path):
    """Open *url* headlessly, logging page events, and screenshot it.

    Network responses and console/error events are appended to the
    module-level ``network`` and ``javascript`` lists; a full-page PNG is
    written to *ss_path*.
    """
    browser = await launch()
    try:
        page = await browser.newPage()

        page.on('response', intercept_network_response)
        page.on('console', intercept_console)
        # NOTE(review): 'requestfailed' delivers a Request and 'pageerror'
        # an error value, while intercept_console reads ``.text`` — confirm
        # these event payloads actually expose a .text attribute.
        page.on('requestfailed', intercept_console)
        page.on('pageerror', intercept_console)
        page.on('error', intercept_console)

        await page.goto(url)
        await page.screenshot({'path': ss_path, 'fullPage': True})
    finally:
        # Close the browser even if navigation or the screenshot fails,
        # so chromium processes do not accumulate.
        await browser.close()
3932

4033

4134
def collect_data(url, output_folder, output_filename):
@@ -45,36 +38,11 @@ def collect_data(url, output_folder, output_filename):
4538
if not os.path.exists(output_folder):
4639
os.mkdir(output_folder)
4740

48-
options = webdriver.ChromeOptions()
49-
options.add_argument("--start-maximized")
50-
options.add_argument("--force-device-scale-factor=2")
51-
options.add_argument("--disable-infobars")
52-
options.add_argument("--headless")
53-
options.add_argument('--proxy-server=%s' % proxy.proxy)
54-
options.add_argument('--no-sandbox')
55-
options.add_argument('--disable-dev-shm-usage')
56-
capabilities = DesiredCapabilities.CHROME
57-
capabilities['goog:loggingPrefs'] = {'browser': 'ALL'}
41+
asyncio.get_event_loop().run_until_complete(
42+
collect_msgs_and_screenshot(url, os.path.join(output_folder, output_filename)))
5843

59-
driver = webdriver.Chrome(os.environ['CHROMEDRIVER_PATH'], chrome_options=options,
60-
desired_capabilities=capabilities, service_args=["--verbose"])
61-
driver.maximize_window()
62-
driver.fullscreen_window()
63-
proxy.new_har("Logs")
64-
driver.get(url)
65-
66-
logs = driver.get_log('browser')
6744
with open(os.path.join(output_folder, output_filename.split('.')[0] + '_js_log.json'), 'w') as f:
68-
json.dump(logs, f, indent=2)
69-
with open(os.path.join(output_folder, output_filename.split('.')[0] + '_network_log.json'), 'w') as f:
70-
json.dump(proxy.har, f, indent=2)
45+
json.dump(javascript, f, indent=2)
7146

72-
height = driver.execute_script(SCRIPT)
73-
for i in range(4):
74-
driver.execute_script("window.scrollBy(0, " + str(height / 5) + ")")
75-
time.sleep(3)
76-
png = _chrome_full_screenshot(driver)
77-
with open(os.path.join(output_folder, output_filename), 'wb') as f:
78-
f.write(png)
79-
driver.execute_script("window.scrollTo(0, 0)")
80-
driver.close()
47+
with open(os.path.join(output_folder, output_filename.split('.')[0] + '_network_log.json'), 'w') as f:
48+
json.dump(network, f, indent=2)

fred/data/log_preprocess.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
from urllib.parse import urlparse
32
import re
43

54

@@ -11,23 +10,11 @@ def __init__(self, log_file):
1110
def _js_messages(self):
1211
messages = []
1312
for log in self.logs:
14-
processed_message = re.sub(r'http:\S+', '', log['message'])
13+
processed_message = re.sub(r'http:\S+', '', log)
1514
messages.append(processed_message)
1615
return messages
1716

1817

1918
class NetworkLogPreprocessor(object):
    """Loads pre-collected network log entries from a JSON file.

    The file is expected to hold a JSON list (as written by
    ``collect.py``, presumably '<status><url>' strings — verify against
    the writer if the format matters to callers).
    """

    def __init__(self, log_file):
        # Use a context manager so the file handle is closed promptly
        # instead of leaking until garbage collection.
        with open(log_file, 'r') as f:
            self.network_messages = json.load(f)

fred/run.py

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
from flask import Flask
21
from flask_restful import Api
32
from flask_cors import CORS
43
from endpoints.verify import Verify
54
from endpoints.get_ids import IDList
65
from endpoints.shutdown import Shutdown
76
from endpoints.verify import states, id_to_urls
87
from endpoints.get_result import Result
9-
from apscheduler.schedulers.background import BackgroundScheduler
10-
from flask import Flask, request, send_from_directory
11-
from browsermobproxy import Server
8+
from flask import send_from_directory
9+
import sys
10+
11+
sys.path.append('../')
12+
from fred import app
1213

1314

1415
def clear_ended():
@@ -18,31 +19,25 @@ def clear_ended():
1819
del id_to_urls[k]
1920

2021

21-
app = Flask(__name__, static_url_path='')
22-
23-
2422
@app.route('/static/<path:path>')
def send_js(path):
    """Serve a static asset from the bundled frontend/ directory."""
    return send_from_directory('frontend', path)
2725

2826

29-
if app.config["DEBUG"]:
30-
@app.after_request
31-
def after_request(response):
32-
response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate, public, max-age=0"
33-
response.headers["Expires"] = 0
34-
response.headers["Pragma"] = "no-cache"
35-
return response
36-
server = Server("utils/browsermob_proxy/bin/browsermob-proxy", options={'port': 8090})
37-
server.start()
38-
proxy = server.create_proxy()
39-
cors = CORS(app, resources={r"*": {"origins": "*"}})
40-
api = Api(app)
41-
api.add_resource(Verify, "/api/verify")
42-
api.add_resource(IDList, "/api/ids")
43-
api.add_resource(Shutdown, "/api/shutdown")
44-
api.add_resource(Result, "/api/result")
45-
4627
if __name__ == '__main__':
    if app.config["DEBUG"]:
        @app.after_request
        def after_request(response):
            # Disable client-side caching while developing so edits show
            # up immediately.
            response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate, public, max-age=0"
            response.headers["Expires"] = 0
            response.headers["Pragma"] = "no-cache"
            return response

    # Allow cross-origin requests from any origin on every route.
    cors = CORS(app, resources={r"*": {"origins": "*"}})
    api = Api(app)
    for resource, route in ((Verify, "/api/verify"),
                            (IDList, "/api/ids"),
                            (Shutdown, "/api/shutdown"),
                            (Result, "/api/result")):
        api.add_resource(resource, route)

    app.run(host='0.0.0.0', debug=True)
48-
server.stop()

0 commit comments

Comments
 (0)