Skip to content

Commit 55fdfb9

Browse files
committed
Add a callback cb_selenium
This patch introduces 2 new parameters, `cb_selenium` and `cb_selenium_kwargs`. The purpose of the selenium callback is to initialize the webpage the way the scraper wants. In this callback you can use the webdriver to perform some actions and wait for an expected page state. Refs: #24, #39
1 parent a670a97 commit 55fdfb9

File tree

4 files changed

+77
-3
lines changed

4 files changed

+77
-3
lines changed

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def parse_result(self, response):
5757
```
5858

5959
### Additional arguments
60-
The `scrapy_selenium.SeleniumRequest` accept 4 additional arguments:
60+
The `scrapy_selenium.SeleniumRequest` accepts 6 additional arguments:
6161

6262
#### `wait_time` / `wait_until`
6363

@@ -97,3 +97,27 @@ yield SeleniumRequest(
9797
script='window.scrollTo(0, document.body.scrollHeight);',
9898
)
9999
```
100+
101+
#### `cb_selenium` / `cb_selenium_kwargs`
102+
When used, the callback is called instead of `webdriver.get(request.url)`. It gives you more
103+
control to put the webpage into the state that you expect.
104+
```python
105+
def cb_selenium(url, webdriver, arg1):
106+
wait = WebDriverWait(webdriver, timeout=10)
107+
webdriver.get(url)
108+
109+
btn = wait.until(
110+
EC.element_to_be_clickable((By.XPATH, "//button[@class='button']"))
111+
)
112+
btn.click()
113+
114+
wait.until(EC.visibility_of_element_located((By.ID, arg1)))
115+
116+
117+
yield SeleniumRequest(
118+
url=url,
119+
callback=self.parse_result,
120+
cb_selenium=cb_selenium,
121+
cb_selenium_kwargs={"arg1": "123456"},
122+
)
123+
```

scrapy_selenium/http.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
class SeleniumRequest(Request):
77
"""Scrapy ``Request`` subclass providing additional arguments"""
88

9-
def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=None, *args, **kwargs):
9+
def __init__(self, wait_time=None, wait_until=None, screenshot=False,
10+
script=None, cb_selenium=None, cb_selenium_kwargs=None, *args, **kwargs):
1011
"""Initialize a new selenium request
1112
1213
Parameters
@@ -21,12 +22,20 @@ def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=Non
2122
will be returned in the response "meta" attribute.
2223
script: str
2324
JavaScript code to execute.
25+
cb_selenium: method
26+
Selenium handler which contains webdriver actions leading to the expected
27+
state of the web page. The handler takes url, webdriver and custom arguments if needed
28+
`cb_selenium(url, webdriver, arg1, arg2)`.
29+
cb_selenium_kwargs: dict
30+
Keyword arguments for the selenium callback `cb_selenium`.
2431
2532
"""
2633

2734
self.wait_time = wait_time
2835
self.wait_until = wait_until
2936
self.screenshot = screenshot
3037
self.script = script
38+
self.cb_selenium = cb_selenium
39+
self.cb_selenium_kwargs = cb_selenium_kwargs
3140

3241
super().__init__(*args, **kwargs)

scrapy_selenium/middlewares.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,11 @@ def process_request(self, request, spider):
100100
if not isinstance(request, SeleniumRequest):
101101
return None
102102

103-
self.driver.get(request.url)
103+
if callable(request.cb_selenium):
104+
kwargs = request.cb_selenium_kwargs if request.cb_selenium_kwargs else {}
105+
request.cb_selenium(request.url, self.driver, **kwargs)
106+
else:
107+
self.driver.get(request.url)
104108

105109
for cookie_name, cookie_value in request.cookies.items():
106110
self.driver.add_cookie(

tests/test_middlewares.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
from scrapy import Request
66
from scrapy.crawler import Crawler
77

8+
from selenium.webdriver.support.ui import WebDriverWait
9+
from selenium.webdriver.support import expected_conditions as ec
10+
from selenium.webdriver.common.by import By
11+
from selenium.webdriver.common.keys import Keys
12+
813
from scrapy_selenium.http import SeleniumRequest
914
from scrapy_selenium.middlewares import SeleniumMiddleware
1015

@@ -135,3 +140,35 @@ def test_process_request_should_execute_script_if_script_option(self):
135140
html_response.selector.xpath('//title/text()').extract_first(),
136141
'scrapy_selenium'
137142
)
143+
144+
def test_process_request_should_execute_cb_selenium(self):
145+
"""Test that the ``process_request`` should execute cb_selenium and return a response"""
146+
147+
def cb_selenium(url, webdriver, query):
148+
wait = WebDriverWait(webdriver, timeout=10)
149+
150+
webdriver.get(url)
151+
152+
elt = wait.until(ec.visibility_of_element_located((By.ID, "id-search-field")))
153+
elt.send_keys(query + Keys.ENTER)
154+
155+
wait.until(ec.visibility_of_element_located(
156+
(By.XPATH, "//ul[@class='list-recent-events menu']")
157+
))
158+
159+
selenium_request = SeleniumRequest(
160+
url='http://www.python.org',
161+
cb_selenium=cb_selenium,
162+
cb_selenium_kwargs={"query": "python"}
163+
)
164+
165+
html_response = self.selenium_middleware.process_request(
166+
request=selenium_request,
167+
spider=None
168+
)
169+
170+
titles_xpath = "//ul[@class='list-recent-events menu']/li/h3/a/text()"
171+
self.assertIn(
172+
"python",
173+
html_response.selector.xpath(titles_xpath).extract_first().lower()
174+
)

0 commit comments

Comments
 (0)