Commit 55967e8

Track shared timers manually
1 parent f4b41f0 commit 55967e8

File tree

2 files changed: +24 -19 lines changed

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 12 additions & 10 deletions
```diff
@@ -5,14 +5,12 @@
 from abc import ABC
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
-from weakref import WeakKeyDictionary
 
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
-from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
@@ -28,7 +26,7 @@
     from typing_extensions import Unpack
 
     from crawlee import RequestTransformAction
-    from crawlee._types import EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
 
     from ._abstract_http_parser import AbstractHttpParser
 
@@ -79,7 +77,7 @@ def __init__(
         self._parser = parser
         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
-        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -132,13 +130,17 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        self._shared_navigation_timeouts[context] = SharedTimeout(self._navigation_timeout)
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
 
-        for hook in self._pre_navigation_hooks:
-            async with self._shared_navigation_timeouts[context]:
-                await hook(context)
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
 
-        yield context
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -240,7 +242,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        async with self._shared_navigation_timeouts[context] as remaining_timeout:
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
             result = await self._http_client.crawl(
                 request=context.request,
                 session=context.session,
```
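Both files make the same change: a `WeakKeyDictionary` keyed by context objects is replaced with a plain `dict` keyed by `id(context)`, and the entry is removed explicitly in a `finally` block instead of relying on weak-reference eviction. The sketch below illustrates that pattern in isolation; the `SharedTimeout` class here is a simplified stand-in whose behavior is an assumption for illustration, not the actual `crawlee._utils.time.SharedTimeout`.

```python
import asyncio
from datetime import timedelta


class SharedTimeout:
    """Simplified stand-in (assumption): an async context manager that meters
    a single shared time budget across several steps."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget

    async def __aenter__(self) -> timedelta:
        if self._remaining <= timedelta(0):
            raise TimeoutError('shared navigation timeout exhausted')
        self._started = asyncio.get_running_loop().time()
        return self._remaining

    async def __aexit__(self, *_: object) -> None:
        # Deduct the time spent inside the block from the shared budget.
        elapsed = asyncio.get_running_loop().time() - self._started
        self._remaining -= timedelta(seconds=elapsed)


# Plain dict keyed by id(context): the commit's replacement for WeakKeyDictionary.
_shared_timeouts: dict[int, SharedTimeout] = {}


async def run_with_shared_budget(context: object) -> None:
    context_id = id(context)
    _shared_timeouts[context_id] = SharedTimeout(timedelta(minutes=1))
    try:
        # Each step draws from the same budget, like pre-navigation hooks
        # followed by the actual navigation in the crawler.
        async with _shared_timeouts[context_id] as remaining:
            print(f'hook step, {remaining.total_seconds():.1f}s left')
        async with _shared_timeouts[context_id] as remaining:
            print(f'navigation step, {remaining.total_seconds():.1f}s left')
    finally:
        # Manual cleanup replaces the automatic eviction a weak-keyed mapping
        # provided; pop() runs even if a step raises or times out.
        _shared_timeouts.pop(context_id, None)


if __name__ == '__main__':
    asyncio.run(run_with_shared_budget(object()))
```

One caveat of `id()` keys is that CPython can reuse an id once the object is garbage collected, which is why the commit creates and pops the entry within the same generator scope that owns the context.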

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 12 additions & 9 deletions
```diff
@@ -6,7 +6,6 @@
 from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
-from weakref import WeakKeyDictionary
 
 import playwright.async_api
 from more_itertools import partition
@@ -149,7 +148,7 @@ def __init__(
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
-        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
@@ -241,14 +240,18 @@ async def _open_page(
             block_requests=partial(block_requests, page=crawlee_page.page),
         )
 
-        self._shared_navigation_timeouts[pre_navigation_context] = SharedTimeout(self._navigation_timeout)
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
 
-        async with browser_page_context(crawlee_page.page):
-            for hook in self._pre_navigation_hooks:
-                async with self._shared_navigation_timeouts[context]:
-                    await hook(pre_navigation_context)
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
 
-            yield pre_navigation_context
+                yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,
@@ -316,7 +319,7 @@ async def _navigate(
         await context.page.route(context.request.url, route_handler)
 
         try:
-            async with self._shared_navigation_timeouts[context] as remaining_timeout:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
                 response = await context.page.goto(
                     context.request.url, timeout=remaining_timeout.total_seconds() * 1000
                 )
```
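The commit message does not state why the weak-keyed mapping was dropped, but two general constraints of `WeakKeyDictionary` plausibly motivate it: keys must be hashable and must support weak references, and entries can vanish whenever the key is garbage collected. The snippet below is a hedged illustration of the weak-reference constraint; `SlottedContext` is a hypothetical type for demonstration, not a crawlee class.

```python
from weakref import WeakKeyDictionary


class SlottedContext:
    """Hypothetical context type: __slots__ without a __weakref__ slot means
    instances cannot be weakly referenced at all."""

    __slots__ = ('request',)

    def __init__(self, request: str) -> None:
        self.request = request


timeouts: WeakKeyDictionary = WeakKeyDictionary()
try:
    timeouts[SlottedContext('https://example.com')] = 'shared timeout'
except TypeError as exc:
    # CPython raises: cannot create weak reference to 'SlottedContext' object
    print(f'WeakKeyDictionary rejects the key: {exc}')
```

Keying a plain `dict` by `id(context)` sidesteps both the hashability and the weak-referenceability requirements, at the price of the manual `pop()` in the `finally` blocks above.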
