Skip to content

Commit 70cab80

Browse files
committed
Merge remote-tracking branch 'origin/main' into cleanup-deprecated-header-handling
2 parents 811e1b5 + e7dbf9c commit 70cab80

File tree

16 files changed

+277
-71
lines changed

16 files changed

+277
-71
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.0.44
2+
current_version = 0.0.46
33
commit = True
44
tag = True
55

.github/workflows/tests.yml

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
fail-fast: false
1111
matrix:
1212
os: [ubuntu-22.04]
13-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
13+
python-version: ["3.10", "3.11", "3.12", "3.13"]
1414
include:
1515
- os: macos-14
1616
python-version: "3.12"
@@ -39,6 +39,14 @@ jobs:
3939
- name: Run twisted tests
4040
run: tox -e py-twisted
4141

42+
- name: Run pinned dependency tests (Windows, asyncio tests)
43+
if: runner.os == 'Windows'
44+
run: tox -e py-pinned
45+
46+
- name: Run pinned dependency tests (Windows, Twisted tests)
47+
if: runner.os == 'Windows'
48+
run: tox -e py-pinned-twisted
49+
4250
- name: Upload coverage report (Linux)
4351
if: runner.os == 'Linux'
4452
env:
@@ -65,3 +73,38 @@ jobs:
6573
$ProgressPreference = 'SilentlyContinue'
6674
Invoke-WebRequest -Uri https://uploader.codecov.io/latest/windows/codecov.exe -Outfile codecov.exe
6775
.\codecov.exe
76+
77+
tests-pinned:
78+
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
79+
runs-on: ubuntu-22.04
80+
timeout-minutes: 20
81+
82+
steps:
83+
- uses: actions/checkout@v4
84+
85+
- name: Set up Python 3.10
86+
uses: actions/setup-python@v5
87+
with:
88+
python-version: "3.10"
89+
90+
- name: Set up node
91+
uses: actions/setup-node@v4
92+
with:
93+
node-version: 18
94+
95+
- name: Install tox
96+
run: pip install tox
97+
98+
- name: Run asyncio tests with pinned versions
99+
run: tox -e py-pinned
100+
101+
- name: Run twisted tests with pinned versions
102+
run: tox -e py-pinned-twisted
103+
104+
- name: Upload coverage report
105+
env:
106+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
107+
run: |
108+
curl -Os https://uploader.codecov.io/latest/linux/codecov
109+
chmod +x codecov
110+
./codecov

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ to integrate `asyncio`-based projects such as `Playwright`.
2222

2323
### Minimum required versions
2424

25-
* Python >= 3.9
26-
* Scrapy >= 2.0 (!= 2.4.0)
27-
* Playwright >= 1.15
25+
* Python >= 3.10
26+
* Scrapy >= 2.7
27+
* Playwright >= 1.40
2828

2929

3030
## Installation

docs/changelog.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
# scrapy-playwright changelog
22

33

4+
### [v0.0.46](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.46) (2026-01-21)
5+
6+
* Threaded loop updates & fixes (#361)
7+
8+
9+
### [v0.0.45](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.45) (2026-01-16)
10+
11+
* Scrapy 2.14 compatibility (#356, #359)
12+
13+
414
### [v0.0.44](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.44) (2025-08-13)
515

616
* Fix crawl getting stuck on Windows with Scrapy>=2.13 (#351)

pylintrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ disable=
99
too-few-public-methods,
1010
too-many-arguments,
1111
too-many-instance-attributes,
12+
too-many-lines,
1213
# tests
1314
duplicate-code,
1415
import-outside-toplevel,

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
[tool.black]
22
line-length = 99
3+
4+
[tool.pytest.ini_options]
5+
reruns = "2"

scrapy_playwright/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.44"
1+
__version__ = "0.0.46"

scrapy_playwright/_utils.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import platform
44
import threading
5+
from dataclasses import dataclass
56
from typing import Awaitable, Dict, Iterator, Optional, Tuple, Union
67

78
import scrapy
@@ -104,6 +105,13 @@ async def _get_header_value(
104105
return None
105106

106107

108+
@dataclass
109+
class _QueueItem:
110+
coro: Awaitable
111+
promise: Deferred | asyncio.Future
112+
loop: asyncio.AbstractEventLoop | None = None
113+
114+
107115
class _ThreadedLoopAdapter:
108116
"""Utility class to start an asyncio event loop in a new thread and redirect coroutines.
109117
This allows to run Playwright in a different loop than the Scrapy crawler, allowing to
@@ -116,32 +124,60 @@ class _ThreadedLoopAdapter:
116124
_stop_events: Dict[int, asyncio.Event] = {}
117125

118126
@classmethod
119-
async def _handle_coro(cls, coro: Awaitable, dfd: Deferred) -> None:
127+
async def _handle_coro_deferred(cls, queue_item: _QueueItem) -> None:
120128
from twisted.internet import reactor
121129

130+
dfd: Deferred = queue_item.promise
131+
122132
try:
123-
result = await coro
133+
result = await queue_item.coro
124134
except Exception as exc:
125135
reactor.callFromThread(dfd.errback, failure.Failure(exc))
126136
else:
127137
reactor.callFromThread(dfd.callback, result)
128138

139+
@classmethod
140+
async def _handle_coro_future(cls, queue_item: _QueueItem) -> None:
141+
future: asyncio.Future = queue_item.promise
142+
loop: asyncio.AbstractEventLoop = queue_item.loop # type: ignore[assignment]
143+
try:
144+
result = await queue_item.coro
145+
except Exception as exc:
146+
loop.call_soon_threadsafe(future.set_exception, exc)
147+
else:
148+
loop.call_soon_threadsafe(future.set_result, result)
149+
129150
@classmethod
130151
async def _process_queue(cls) -> None:
131152
while any(not ev.is_set() for ev in cls._stop_events.values()):
132-
coro, dfd = await cls._coro_queue.get()
133-
asyncio.create_task(cls._handle_coro(coro, dfd))
153+
queue_item = await cls._coro_queue.get()
154+
if isinstance(queue_item.promise, asyncio.Future):
155+
asyncio.create_task(cls._handle_coro_future(queue_item))
156+
elif isinstance(queue_item.promise, Deferred):
157+
asyncio.create_task(cls._handle_coro_deferred(queue_item))
134158
cls._coro_queue.task_done()
135159

136160
@classmethod
137-
def _deferred_from_coro(cls, coro) -> Deferred:
161+
def _deferred_from_coro(cls, coro: Awaitable) -> Deferred:
138162
dfd: Deferred = Deferred()
139-
asyncio.run_coroutine_threadsafe(cls._coro_queue.put((coro, dfd)), cls._loop)
163+
queue_item = _QueueItem(coro=coro, promise=dfd)
164+
asyncio.run_coroutine_threadsafe(cls._coro_queue.put(queue_item), cls._loop)
140165
return dfd
141166

142167
@classmethod
143-
def start(cls, caller_id: int) -> None:
144-
cls._stop_events[caller_id] = asyncio.Event()
168+
def _future_from_coro(cls, coro: Awaitable) -> asyncio.Future:
169+
target_loop = asyncio.get_running_loop() # Scrapy thread loop
170+
future: asyncio.Future = asyncio.Future()
171+
queue_item = _QueueItem(coro=coro, promise=future, loop=target_loop)
172+
asyncio.run_coroutine_threadsafe(cls._coro_queue.put(queue_item), cls._loop)
173+
return future
174+
175+
@classmethod
176+
def start(cls, download_handler_id: int) -> None:
177+
"""Start the event loop in a new thread if not already started.
178+
Should be called from the Scrapy thread.
179+
"""
180+
cls._stop_events[download_handler_id] = asyncio.Event()
145181
if not getattr(cls, "_loop", None):
146182
policy = asyncio.DefaultEventLoopPolicy()
147183
if platform.system() == "Windows":
@@ -155,9 +191,11 @@ def start(cls, caller_id: int) -> None:
155191
asyncio.run_coroutine_threadsafe(cls._process_queue(), cls._loop)
156192

157193
@classmethod
158-
def stop(cls, caller_id: int) -> None:
159-
"""Wait until all handlers are closed to stop the event loop and join the thread."""
160-
cls._stop_events[caller_id].set()
194+
def stop(cls, download_handler_id: int) -> None:
195+
"""Wait until all handlers are closed to stop the event loop and join the thread.
196+
Should be called from the Scrapy thread.
197+
"""
198+
cls._stop_events[download_handler_id].set()
161199
if all(ev.is_set() for ev in cls._stop_events.values()):
162200
asyncio.run_coroutine_threadsafe(cls._coro_queue.join(), cls._loop)
163201
cls._loop.call_soon_threadsafe(cls._loop.stop)

scrapy_playwright/handler.py

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
Response as PlaywrightResponse,
2222
Route,
2323
)
24-
from scrapy import Spider, signals
25-
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
24+
from scrapy import Spider, signals, version_info as scrapy_version_info
25+
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
2626
from scrapy.crawler import Crawler
2727
from scrapy.exceptions import NotSupported
2828
from scrapy.http import Request, Response
@@ -50,6 +50,9 @@
5050
__all__ = ["ScrapyPlaywrightDownloadHandler"]
5151

5252

53+
_SCRAPY_ASYNC_API = scrapy_version_info >= (2, 14, 0)
54+
55+
5356
PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler")
5457

5558

@@ -131,20 +134,29 @@ def from_settings(cls, settings: Settings) -> "Config":
131134
return cfg
132135

133136

134-
class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
137+
class ScrapyPlaywrightDownloadHandler(HTTP11DownloadHandler):
135138
playwright_context_manager: Optional[PlaywrightContextManager] = None
136139
playwright: Optional[AsyncPlaywright] = None
137140

138141
def __init__(self, crawler: Crawler) -> None:
139-
super().__init__(settings=crawler.settings, crawler=crawler)
140142
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
141-
crawler.signals.connect(self._engine_started, signals.engine_started)
143+
if _SCRAPY_ASYNC_API:
144+
super().__init__(crawler=crawler)
145+
else:
146+
super().__init__( # pylint: disable=unexpected-keyword-arg
147+
settings=crawler.settings, crawler=crawler
148+
)
142149
self.stats = crawler.stats
143150
self.config = Config.from_settings(crawler.settings)
144151

145152
if self.config.use_threaded_loop:
146153
_ThreadedLoopAdapter.start(id(self))
147154

155+
if _SCRAPY_ASYNC_API:
156+
crawler.signals.connect(self._maybe_launch_in_thread, signals.engine_started)
157+
else:
158+
crawler.signals.connect(self._engine_started, signals.engine_started)
159+
148160
self.browser_launch_lock = asyncio.Lock()
149161
self.context_launch_lock = asyncio.Lock()
150162
self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
@@ -175,10 +187,17 @@ def _deferred_from_coro(self, coro: Awaitable) -> Deferred:
175187
return _ThreadedLoopAdapter._deferred_from_coro(coro)
176188
return deferred_from_coro(coro)
177189

190+
def _maybe_future_from_coro(self, coro: Awaitable) -> Awaitable | asyncio.Future:
191+
if self.config.use_threaded_loop:
192+
return _ThreadedLoopAdapter._future_from_coro(coro)
193+
return coro
194+
178195
def _engine_started(self) -> Deferred:
179-
"""Launch the browser. Use the engine_started signal as it supports returning deferreds."""
180196
return self._deferred_from_coro(self._launch())
181197

198+
async def _maybe_launch_in_thread(self) -> None:
199+
await self._maybe_future_from_coro(self._launch())
200+
182201
async def _launch(self) -> None:
183202
"""Launch Playwright manager and configured startup context(s)."""
184203
logger.info("Starting download handler")
@@ -346,13 +365,24 @@ def _set_max_concurrent_context_count(self):
346365
"playwright/context_count/max_concurrent", len(self.context_wrappers)
347366
)
348367

349-
@inlineCallbacks
350-
def close(self) -> Deferred:
351-
logger.info("Closing download handler")
352-
yield super().close()
353-
yield self._deferred_from_coro(self._close())
354-
if self.config.use_threaded_loop:
355-
_ThreadedLoopAdapter.stop(id(self))
368+
if _SCRAPY_ASYNC_API:
369+
370+
async def close(self) -> None:
371+
logger.info("Closing download handler")
372+
await super().close()
373+
await self._maybe_future_from_coro(self._close())
374+
if self.config.use_threaded_loop:
375+
_ThreadedLoopAdapter.stop(id(self))
376+
377+
else:
378+
379+
@inlineCallbacks
380+
def close(self) -> Deferred: # pylint: disable=invalid-overridden-method
381+
logger.info("Closing download handler")
382+
yield super().close()
383+
yield self._deferred_from_coro(self._close())
384+
if self.config.use_threaded_loop:
385+
_ThreadedLoopAdapter.stop(id(self))
356386

357387
async def _close(self) -> None:
358388
with suppress(TargetClosedError):
@@ -366,12 +396,29 @@ async def _close(self) -> None:
366396
if self.playwright:
367397
await self.playwright.stop()
368398

369-
def download_request(self, request: Request, spider: Spider) -> Deferred:
370-
if request.meta.get("playwright"):
371-
return self._deferred_from_coro(self._download_request(request, spider))
372-
return super().download_request(request, spider)
399+
if _SCRAPY_ASYNC_API:
400+
401+
async def download_request(self, request: Request) -> Response:
402+
if request.meta.get("playwright"):
403+
coro = self._download_request(request)
404+
return await self._maybe_future_from_coro(coro)
405+
return await super().download_request( # pylint: disable=no-value-for-parameter
406+
request
407+
)
408+
409+
else:
410+
411+
def download_request( # type: ignore[misc] # pylint: disable=invalid-overridden-method,arguments-differ # noqa: E501
412+
self, request: Request, spider: Spider
413+
) -> Deferred:
414+
if request.meta.get("playwright"):
415+
return self._deferred_from_coro(self._download_request(request, spider))
416+
return super().download_request( # pylint: disable=unexpected-keyword-arg
417+
request=request, spider=spider
418+
)
373419

374-
async def _download_request(self, request: Request, spider: Spider) -> Response:
420+
async def _download_request(self, request: Request, spider: Spider | None = None) -> Response:
421+
spider = spider or self._crawler.spider
375422
counter = 0
376423
while True:
377424
try:
@@ -562,8 +609,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
562609
response = await page.goto(url=request.url, **page_goto_kwargs)
563610
except PlaywrightError as err:
564611
if not (
565-
self.config.browser_type_name in ("firefox", "webkit")
566-
and "Download is starting" in err.message
612+
"Download is starting" in err.message
567613
or self.config.browser_type_name == "chromium"
568614
and "net::ERR_ABORTED" in err.message
569615
):

setup.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
"Development Status :: 4 - Beta",
2323
"License :: OSI Approved :: BSD License",
2424
"Programming Language :: Python",
25-
"Programming Language :: Python :: 3.9",
2625
"Programming Language :: Python :: 3.10",
2726
"Programming Language :: Python :: 3.11",
2827
"Programming Language :: Python :: 3.12",
@@ -33,9 +32,9 @@
3332
"Topic :: Software Development :: Libraries :: Application Frameworks",
3433
"Topic :: Software Development :: Libraries :: Python Modules",
3534
],
36-
python_requires=">=3.9",
35+
python_requires=">=3.10",
3736
install_requires=[
38-
"scrapy>=2.0,!=2.4.0",
39-
"playwright>=1.15",
37+
"scrapy>=2.7",
38+
"playwright>=1.40",
4039
],
4140
)

0 commit comments

Comments
 (0)