Skip to content

Commit 5f4d43b

Browse files
author
Alexandru Meterez
committed
Update data collection to use pyppeteer instead of selenium
1 parent f061629 commit 5f4d43b

File tree

1 file changed

+35
-63
lines changed

1 file changed

+35
-63
lines changed

fred/data/collect.py

Lines changed: 35 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -1,81 +1,53 @@
11
import os
2-
from selenium import webdriver
3-
from selenium.webdriver import DesiredCapabilities
4-
import time
2+
import asyncio
53
import json
6-
import base64
7-
import requests
4+
from pyppeteer import launch
85

9-
SCRIPT = """var body = document.body,
10-
html = document.documentElement;
11-
return Math.max( body.scrollHeight, body.offsetHeight,
12-
html.clientHeight, html.scrollHeight, html.offsetHeight );"""
6+
network = []
7+
javascript = []
138

149

15-
def _chrome_full_screenshot(driver):
16-
def send(cmd, params):
17-
resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
18-
url = driver.command_executor._url + resource
19-
body = json.dumps({'cmd': cmd, 'params': params})
20-
response = driver.command_executor._request('POST', url, body)
21-
return response.get('value')
10+
async def intercept_network_response(response):
    """Echo one network response to stdout and record it in ``network``.

    Args:
        response: Event payload exposing ``url`` and ``status`` attributes.

    The recorded entry is the status immediately followed by the URL, with
    no separator between them.
    """
    target = response.url
    print("URL:", target)
    code = response.status
    print(code)
    network.append("".join((str(code), target)))
    print("\n\n")
2215

23-
def evaluate(script):
24-
response = send('Runtime.evaluate', {'returnByValue': True, 'expression': script})
25-
return response['result']['value']
2616

27-
metrics = evaluate( \
28-
"({" + \
29-
"width: Math.max(window.innerWidth, document.body.scrollWidth, document.documentElement.scrollWidth)|0," + \
30-
"height: Math.max(innerHeight, document.body.scrollHeight, document.documentElement.scrollHeight)|0," + \
31-
"deviceScaleFactor: window.devicePixelRatio || 1," + \
32-
"mobile: typeof window.orientation !== 'undefined'" + \
33-
"})")
34-
send('Emulation.setDeviceMetricsOverride', metrics)
35-
screenshot = send('Page.captureScreenshot', {'format': 'png', 'fromSurface': True})
36-
send('Emulation.clearDeviceMetricsOverride', {})
17+
def _describe_event(payload):
    """Return a printable, JSON-serialisable description of an event payload."""
    text = getattr(payload, 'text', None)
    if isinstance(text, str):
        # Console messages carry their content in a string ``text`` attribute.
        return text
    url = getattr(payload, 'url', None)
    if isinstance(url, str):
        # Request-like payload (e.g. from 'requestfailed'): report the URL
        # and, when the payload offers it, the failure reason.
        failure = getattr(payload, 'failure', None)
        reason = failure() if callable(failure) else None
        error_text = reason.get('errorText') if isinstance(reason, dict) else None
        return 'Request failed: {} ({})'.format(url, error_text or 'unknown error')
    # Exceptions and anything else: fall back to the string form.
    return str(payload)


async def intercept_console(response):
    """Echo a page event to stdout and record it in ``javascript``.

    This handler is attached to several page events, not only 'console',
    so the payload is not guaranteed to have a string ``text`` attribute.
    Reading ``response.text`` unconditionally raised ``AttributeError`` for
    such payloads and the event was lost; they are now described via their
    URL/failure reason or their ``str()`` form instead.

    Args:
        response: Event payload (console message, failed request, or error).
    """
    message = _describe_event(response)
    print(message)
    javascript.append(message)
    print("\n\n")
3721

38-
return base64.b64decode(screenshot['data'])
3922

23+
async def collect_msgs_and_screenshot(url, ss_path):
    """Load ``url`` in a freshly launched browser and save a full-page screenshot.

    While the page loads, network responses are passed to
    ``intercept_network_response`` and console messages, failed requests and
    page errors are passed to ``intercept_console``.

    Args:
        url: Address to navigate to.
        ss_path: File path the screenshot is written to.

    The browser is closed even when navigation or the screenshot fails, so
    an error does not leave a browser process behind; the exception itself
    still propagates to the caller.
    """
    browser = await launch()
    try:
        page = await browser.newPage()

        # Handlers must be attached before navigation so that events fired
        # during the initial load are captured.
        page.on('response', intercept_network_response)
        page.on('console', intercept_console)
        page.on('requestfailed', intercept_console)
        page.on('pageerror', intercept_console)
        page.on('error', intercept_console)

        await page.goto(url)
        await page.screenshot({'path': ss_path, 'fullPage': True})
    finally:
        await browser.close()
37+
38+
39+
def collect_data(url, output_folder, output_filename):
    """Collect a screenshot, console log and network log for ``url``.

    All output is written below ``./tmp/<output_folder>``:

    * the screenshot, named ``output_filename`` with ``.png`` appended
      unless the name already ends in ``.png``;
    * ``<stem>_js_log.json`` with the recorded console/page messages;
    * ``<stem>_network_log.json`` with the recorded network responses;

    where ``<stem>`` is ``output_filename`` up to its first ``.``.

    Args:
        url: Page to load.
        output_folder: Folder name (relative to ``./tmp``) for the results.
        output_filename: Base name for the generated files.
    """
    output_folder = os.path.join("./tmp", output_folder)
    # Creates ./tmp and the target folder in one step, without the
    # check-then-create race of exists() followed by mkdir().
    os.makedirs(output_folder, exist_ok=True)

    # The log lists are module-level and shared with the event handlers;
    # empty them in place so this run's files do not include entries
    # recorded by earlier calls.
    network.clear()
    javascript.clear()

    # NOTE(review): the log names below strip an extension from
    # output_filename, which suggests callers may pass e.g. "page.png";
    # avoid producing "page.png.png" in that case. Confirm against callers.
    screenshot_name = output_filename
    if not screenshot_name.lower().endswith('.png'):
        screenshot_name += '.png'

    asyncio.get_event_loop().run_until_complete(
        collect_msgs_and_screenshot(url, os.path.join(output_folder, screenshot_name)))

    stem = output_filename.split('.')[0]

    with open(os.path.join(output_folder, stem + '_js_log.json'), 'w') as f:
        json.dump(javascript, f, indent=2)

    with open(os.path.join(output_folder, stem + '_network_log.json'), 'w') as f:
        json.dump(network, f, indent=2)

0 commit comments

Comments
 (0)