@@ -5,13 +5,12 @@
 from abc import ABC
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
-from weakref import WeakKeyDictionary

 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
-from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
@@ -28,7 +26,7 @@
     from typing_extensions import Unpack

     from crawlee import RequestTransformAction
-    from crawlee._types import EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction

     from ._abstract_http_parser import AbstractHttpParser
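Note: moving the BasicCrawlingContext import under `if TYPE_CHECKING:` is safe only when the annotations that use it are not evaluated at runtime, typically via `from __future__ import annotations`. A minimal sketch of the pattern, with hypothetical names not taken from this diff:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only during static analysis; nothing runs at import time,
    # which avoids import cycles and startup cost.
    from collections.abc import Awaitable

def run_hook(hook: Awaitable[None]) -> None:
    # With the __future__ import, the annotation stays a string at runtime,
    # so Awaitable does not need to exist when this function is defined.
    ...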
@@ -79,7 +77,7 @@ def __init__(
         self._parser = parser
         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
-        self._shared_navigation_timeouts = WeakKeyDictionary[BasicCrawlingContext, SharedTimeout]()
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
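Note on swapping WeakKeyDictionary for dict[int, SharedTimeout]: a WeakKeyDictionary needs keys that are hashable and weak-referenceable, and it compares keys by equality, whereas id() gives identity-based keys for any object. The diff itself does not state its motivation, so the following is only an illustrative sketch, with a hypothetical Context class:

from dataclasses import dataclass
from weakref import WeakKeyDictionary

@dataclass
class Context:  # hypothetical; a default dataclass sets __hash__ to None
    url: str

ctx = Context('https://example.com')

weak_map: WeakKeyDictionary = WeakKeyDictionary()
try:
    weak_map[ctx] = 'state'  # hashing the key fails for this class
except TypeError as err:
    print(err)  # unhashable type: 'Context'

id_map: dict[int, str] = {}
id_map[id(ctx)] = 'state'  # id() accepts any object, hashable or not

The trade-off of id() keys is that nothing removes the entries automatically, which is what the try/finally in the next hunk addresses.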
@@ -132,13 +130,17 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        self._shared_navigation_timeouts[context] = SharedTimeout(self._navigation_timeout)
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)

-        for hook in self._pre_navigation_hooks:
-            async with self._shared_navigation_timeouts[context]:
-                await hook(context)
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)

-        yield context
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
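The try/finally introduced above is the price of the id()-keyed dict: unlike weak keys, id() keys are never reclaimed automatically, so the entry must be popped explicitly or it leaks, and a recycled id() could later collide with a new context. A compressed, runnable sketch of the same pattern, using a stand-in for SharedTimeout and otherwise hypothetical names:

import asyncio
from datetime import timedelta

class FakeSharedTimeout:
    """Stand-in for SharedTimeout: an async context manager holding a budget."""

    def __init__(self, budget: timedelta) -> None:
        self.remaining = budget

    async def __aenter__(self) -> timedelta:
        return self.remaining

    async def __aexit__(self, *exc: object) -> None:
        return None

timeouts: dict[int, FakeSharedTimeout] = {}

async def navigate(context: object) -> None:
    key = id(context)
    timeouts[key] = FakeSharedTimeout(timedelta(minutes=1))
    try:
        async with timeouts[key] as remaining:
            print('remaining budget:', remaining)
    finally:
        # Without this pop, the entry outlives the context, and a later
        # object allocated at the same id() would silently share its state.
        timeouts.pop(key, None)

asyncio.run(navigate(object()))

Using pop(key, None) rather than del also tolerates a double cleanup, since the finally clause runs even when a hook raises before the generator yields.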
@@ -240,7 +242,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        async with self._shared_navigation_timeouts[context] as remaining_timeout:
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
             result = await self._http_client.crawl(
                 request=context.request,
                 session=context.session,