Skip to content

Commit c531eff

Browse files
elacuesta and Copilot authored
Scrapy 2.14 compatibility (#359)
* Scrapy 2.14 compatibility (download handler arguments) * Ignore pylint warning * Update default download handler invocation * Bump minimum versions * Paint it black * Update greenlet version * Ensure running event loop * Update incorrect ubuntu worker * Async syntax for download_request and close * Also ignore flake8 warning * Bump minimum versions again * Tests: refactor process killing * Update download handling * Update close method * Twisted tests on pinned versions env * Remove py314 for now * Test coroutine download_request * Update tests/tests_asyncio/test_browser.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 573c32a commit c531eff

File tree

10 files changed

+182
-39
lines changed

10 files changed

+182
-39
lines changed

.github/workflows/tests.yml

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
fail-fast: false
1111
matrix:
1212
os: [ubuntu-22.04]
13-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
13+
python-version: ["3.10", "3.11", "3.12", "3.13"]
1414
include:
1515
- os: macos-14
1616
python-version: "3.12"
@@ -65,3 +65,38 @@ jobs:
6565
$ProgressPreference = 'SilentlyContinue'
6666
Invoke-WebRequest -Uri https://uploader.codecov.io/latest/windows/codecov.exe -Outfile codecov.exe
6767
.\codecov.exe
68+
69+
tests-pinned:
70+
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
71+
runs-on: ubuntu-22.04
72+
timeout-minutes: 20
73+
74+
steps:
75+
- uses: actions/checkout@v4
76+
77+
- name: Set up Python 3.10
78+
uses: actions/setup-python@v5
79+
with:
80+
python-version: "3.10"
81+
82+
- name: Set up node
83+
uses: actions/setup-node@v4
84+
with:
85+
node-version: 18
86+
87+
- name: Install tox
88+
run: pip install tox
89+
90+
- name: Run asyncio tests with pinned versions
91+
run: tox -e py-pinned
92+
93+
- name: Run twisted tests with pinned versions
94+
run: tox -e py-pinned-twisted
95+
96+
- name: Upload coverage report
97+
env:
98+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
99+
run: |
100+
curl -Os https://uploader.codecov.io/latest/linux/codecov
101+
chmod +x codecov
102+
./codecov

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ to integrate `asyncio`-based projects such as `Playwright`.
2222

2323
### Minimum required versions
2424

25-
* Python >= 3.9
26-
* Scrapy >= 2.0 (!= 2.4.0)
27-
* Playwright >= 1.15
25+
* Python >= 3.10
26+
* Scrapy >= 2.7
27+
* Playwright >= 1.40
2828

2929

3030
## Installation

scrapy_playwright/handler.py

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,15 @@
2323
Response as PlaywrightResponse,
2424
Route,
2525
)
26-
from scrapy import Spider, signals
26+
from scrapy import Spider, signals, version_info as scrapy_version_info
2727
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
2828
from scrapy.crawler import Crawler
2929
from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning
3030
from scrapy.http import Request, Response
3131
from scrapy.http.headers import Headers
3232
from scrapy.responsetypes import responsetypes
3333
from scrapy.settings import Settings
34-
from scrapy.utils.defer import deferred_from_coro
34+
from scrapy.utils.defer import deferred_from_coro, maybe_deferred_to_future
3535
from scrapy.utils.misc import load_object
3636
from scrapy.utils.reactor import verify_installed_reactor
3737
from twisted.internet.defer import Deferred, inlineCallbacks
@@ -52,6 +52,9 @@
5252
__all__ = ["ScrapyPlaywrightDownloadHandler"]
5353

5454

55+
_SCRAPY_ASYNC_API = scrapy_version_info >= (2, 14, 0)
56+
57+
5558
PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler")
5659

5760

@@ -138,7 +141,12 @@ class ScrapyPlaywrightDownloadHandler(HTTP11DownloadHandler):
138141
playwright: Optional[AsyncPlaywright] = None
139142

140143
def __init__(self, crawler: Crawler) -> None:
141-
super().__init__(settings=crawler.settings, crawler=crawler)
144+
if _SCRAPY_ASYNC_API:
145+
super().__init__(crawler=crawler)
146+
else:
147+
super().__init__( # pylint: disable=unexpected-keyword-arg
148+
settings=crawler.settings, crawler=crawler
149+
)
142150
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
143151
crawler.signals.connect(self._engine_started, signals.engine_started)
144152
self.stats = crawler.stats
@@ -348,13 +356,20 @@ def _set_max_concurrent_context_count(self):
348356
"playwright/context_count/max_concurrent", len(self.context_wrappers)
349357
)
350358

351-
@inlineCallbacks
352-
def close(self) -> Deferred:
353-
logger.info("Closing download handler")
354-
yield super().close()
355-
yield self._deferred_from_coro(self._close())
356-
if self.config.use_threaded_loop:
357-
_ThreadedLoopAdapter.stop(id(self))
359+
if _SCRAPY_ASYNC_API:
360+
361+
async def close(self) -> None:
362+
logger.info("Closing download handler")
363+
await super().close()
364+
await self._close()
365+
366+
else:
367+
368+
@inlineCallbacks
369+
def close(self) -> Deferred: # pylint: disable=invalid-overridden-method
370+
logger.info("Closing download handler")
371+
yield super().close()
372+
yield self._deferred_from_coro(self._close())
358373

359374
async def _close(self) -> None:
360375
with suppress(TargetClosedError):
@@ -367,11 +382,30 @@ async def _close(self) -> None:
367382
await self.playwright_context_manager.__aexit__()
368383
if self.playwright:
369384
await self.playwright.stop()
385+
if self.config.use_threaded_loop:
386+
_ThreadedLoopAdapter.stop(id(self))
387+
388+
if _SCRAPY_ASYNC_API:
389+
390+
async def download_request(self, request: Request) -> Response:
391+
if request.meta.get("playwright"):
392+
return await maybe_deferred_to_future(
393+
self._deferred_from_coro(self._download_request(request, self._crawler.spider))
394+
)
395+
return await super().download_request( # pylint: disable=no-value-for-parameter
396+
request
397+
)
370398

371-
def download_request(self, request: Request, spider: Spider) -> Deferred:
372-
if request.meta.get("playwright"):
373-
return self._deferred_from_coro(self._download_request(request, spider))
374-
return super().download_request(request, spider)
399+
else:
400+
401+
def download_request( # type: ignore[misc] # pylint: disable=invalid-overridden-method,arguments-differ # noqa: E501
402+
self, request: Request, spider: Spider
403+
) -> Deferred:
404+
if request.meta.get("playwright"):
405+
return self._deferred_from_coro(self._download_request(request, spider))
406+
return super().download_request( # pylint: disable=unexpected-keyword-arg
407+
request=request, spider=spider
408+
)
375409

376410
async def _download_request(self, request: Request, spider: Spider) -> Response:
377411
counter = 0
@@ -564,8 +598,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
564598
response = await page.goto(url=request.url, **page_goto_kwargs)
565599
except PlaywrightError as err:
566600
if not (
567-
self.config.browser_type_name in ("firefox", "webkit")
568-
and "Download is starting" in err.message
601+
"Download is starting" in err.message
569602
or self.config.browser_type_name == "chromium"
570603
and "net::ERR_ABORTED" in err.message
571604
):

setup.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
"Development Status :: 4 - Beta",
2323
"License :: OSI Approved :: BSD License",
2424
"Programming Language :: Python",
25-
"Programming Language :: Python :: 3.9",
2625
"Programming Language :: Python :: 3.10",
2726
"Programming Language :: Python :: 3.11",
2827
"Programming Language :: Python :: 3.12",
@@ -33,9 +32,9 @@
3332
"Topic :: Software Development :: Libraries :: Application Frameworks",
3433
"Topic :: Software Development :: Libraries :: Python Modules",
3534
],
36-
python_requires=">=3.9",
35+
python_requires=">=3.10",
3736
install_requires=[
38-
"scrapy>=2.0,!=2.4.0",
39-
"playwright>=1.15",
37+
"scrapy>=2.7",
38+
"playwright>=1.40",
4039
],
4140
)

tests/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from scrapy.http.response.html import HtmlResponse
1111
from scrapy.utils.test import get_crawler
1212

13+
from scrapy_playwright.handler import _SCRAPY_ASYNC_API
14+
1315

1416
logger = logging.getLogger("scrapy-playwright-tests")
1517

@@ -54,7 +56,10 @@ async def make_handler(settings_dict: Optional[dict] = None):
5456
else:
5557
yield handler
5658
finally:
57-
await handler._close()
59+
if _SCRAPY_ASYNC_API:
60+
await handler.close()
61+
else:
62+
await handler._close()
5863

5964

6065
def assert_correct_response(response: HtmlResponse, request: Request) -> None:

tests/conftest.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,18 @@ def pytest_configure(config):
88
# https://twistedmatrix.com/trac/ticket/9766
99
# https://github.com/pytest-dev/pytest-twisted/issues/80
1010

11-
if config.getoption("reactor", "default") == "asyncio" and platform.system() == "Windows":
12-
import asyncio
11+
import asyncio
1312

13+
if config.getoption("reactor", "default") == "asyncio" and platform.system() == "Windows":
1414
selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
1515
asyncio.set_event_loop_policy(selector_policy)
1616

17+
# Ensure there's a running event loop for the tests
18+
try:
19+
asyncio.get_running_loop()
20+
except RuntimeError:
21+
asyncio.set_event_loop(asyncio.new_event_loop())
22+
1723

1824
def pytest_sessionstart(session): # pylint: disable=unused-argument
1925
"""

tests/tests_asyncio/test_browser.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import asyncio
22
import logging
3-
import os
43
import platform
54
import random
65
import re
7-
import signal
86
import subprocess
97
import time
108
import uuid
@@ -148,8 +146,14 @@ def inject_fixtures(self, caplog):
148146
@staticmethod
149147
def kill_chrome():
150148
for proc in psutil.process_iter(["pid", "name"]):
151-
if proc.info["name"].lower() in ("chrome", "chromium"):
152-
os.kill(proc.info["pid"], signal.SIGKILL)
149+
started_time = proc.create_time() # seconds since January 1, 1970 UTC
150+
# only consider processes started in the last 10 seconds
151+
if not started_time >= time.time() - 10:
152+
continue
153+
name = proc.info["name"].lower()
154+
if "chrome" in name or "chromium" in name:
155+
proc.kill()
156+
print("Killed process:", proc)
153157

154158
@allow_windows
155159
async def test_browser_closed_restart(self):

tests/tests_asyncio/test_playwright_requests.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
)
1616
from scrapy import Spider, Request, FormRequest
1717

18-
from scrapy_playwright.handler import DEFAULT_CONTEXT_NAME
18+
from scrapy_playwright.handler import DEFAULT_CONTEXT_NAME, _SCRAPY_ASYNC_API
1919
from scrapy_playwright.page import PageMethod
2020

2121
from tests import allow_windows, make_handler, assert_correct_response
@@ -51,12 +51,22 @@ async def test_basic_response(self):
5151
req = Request(server.urljoin("/index.html"), meta=meta)
5252
resp = await handler._download_request(req, Spider("foo"))
5353

54+
if _SCRAPY_ASYNC_API:
55+
req2 = Request(server.urljoin("/gallery.html"), meta=meta)
56+
resp2 = await handler.download_request(req2)
57+
5458
assert_correct_response(resp, req)
5559
assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
5660
assert isinstance(resp.meta["playwright_page"], PlaywrightPage)
5761
assert resp.meta["playwright_page"].url == resp.url
5862
await resp.meta["playwright_page"].close()
5963

64+
if _SCRAPY_ASYNC_API:
65+
assert_correct_response(resp2, req2)
66+
assert isinstance(resp2.meta["playwright_page"], PlaywrightPage)
67+
assert resp2.meta["playwright_page"].url == resp2.url
68+
await resp2.meta["playwright_page"].close()
69+
6070
@allow_windows
6171
async def test_post_request(self):
6272
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
@@ -499,7 +509,7 @@ async def test_download_file_delay_error(self):
499509
@allow_windows
500510
async def test_download_file_failure(self):
501511
if self.browser_type != "chromium":
502-
pytest.skip()
512+
pytest.skip("Test only on Chromium")
503513

504514
async def cancel_download(download):
505515
await download.cancel()

tests/tests_twisted/test_mixed_requests.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from scrapy import Spider
1+
import pytest
2+
from scrapy import Spider, version_info as scrapy_version_info
23
from scrapy.http import Request, Response
34
from scrapy.utils.test import get_crawler
45
from twisted.internet import defer
@@ -8,11 +9,15 @@
89
from tests.mockserver import StaticMockServer
910

1011

12+
@pytest.mark.skipif(scrapy_version_info >= (2, 14, 0), reason="Does not apply to Scrapy >= 2.14.0")
1113
class MixedRequestsTestCase(TestCase):
1214
"""
13-
This test case ensures the handler's 'download_request' method works as expected, and
15+
This test case ensures the handler's 'download_request' method works as expected and
1416
non-playwright requests are processed correctly. The rest of the tests directly call
1517
'_download_request', which is a coroutine ('download_request' returns a Deferred).
18+
19+
Update: since Scrapy 2.14.0, the default download handler uses the async API, returning a
20+
coroutine from the 'download_request' method, making this test case obsolete.
1621
"""
1722

1823
timeout_ms = 500
@@ -50,7 +55,7 @@ def _check_playwright_ok(response, request):
5055
def _check_playwright_error(failure, url):
5156
# different errors depending on the platform
5257
self.assertTrue(
53-
f"Page.goto: net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value)
58+
f"net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value)
5459
or f"Page.goto: Timeout {self.timeout_ms}ms exceeded" in str(failure.value)
5560
)
5661

0 commit comments

Comments (0)