Skip to content

Commit f885027

Browse files
committed
feat(proxy control): Force a proxy at request level at any given point
Also merges the request's meta into the response's meta
1 parent 6e0277f commit f885027

File tree

9 files changed

+54
-15
lines changed

9 files changed

+54
-15
lines changed

scrapling/engines/_browsers/_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def _page_generator(
175175
proxy: Optional[ProxyType] = None,
176176
) -> Generator["PageInfo[Page]", None, None]:
177177
"""Acquire a page - either from persistent context or fresh context with proxy."""
178-
if self._config.proxy_rotator:
178+
if proxy:
179179
# Rotation mode: create fresh context with the provided proxy
180180
if not self.browser: # pragma: no cover
181181
raise RuntimeError("Browser not initialized for proxy rotation mode")
@@ -344,7 +344,7 @@ async def _page_generator(
344344
proxy: Optional[ProxyType] = None,
345345
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346346
"""Acquire a page - either from persistent context or fresh context with proxy."""
347-
if self._config.proxy_rotator:
347+
if proxy:
348348
# Rotation mode: create fresh context with the provided proxy
349349
if not self.browser: # pragma: no cover
350350
raise RuntimeError("Browser not initialized for proxy rotation mode")

scrapling/engines/_browsers/_controllers.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,11 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
115115
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
116116
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
117117
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
118+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
118119
:return: A `Response` object.
119120
"""
121+
static_proxy = kwargs.pop("proxy", None)
122+
120123
params = _validate(kwargs, self, PlaywrightConfig)
121124
if not self._is_alive: # pragma: no cover
122125
raise RuntimeError("Context manager has been closed")
@@ -129,7 +132,10 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
129132
)
130133

131134
for attempt in range(self._config.retries):
132-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
135+
if self._config.proxy_rotator and static_proxy is None:
136+
proxy = self._config.proxy_rotator.get_proxy()
137+
else:
138+
proxy = static_proxy
133139

134140
with self._page_generator(
135141
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -162,7 +168,7 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
162168
page.wait_for_timeout(params.wait)
163169

164170
response = ResponseFactory.from_playwright_response(
165-
page, first_response, final_response[0], params.selector_config
171+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
166172
)
167173
return response
168174

@@ -276,8 +282,11 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
276282
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
277283
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
278284
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
285+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
279286
:return: A `Response` object.
280287
"""
288+
static_proxy = kwargs.pop("proxy", None)
289+
281290
params = _validate(kwargs, self, PlaywrightConfig)
282291

283292
if not self._is_alive: # pragma: no cover
@@ -291,7 +300,10 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
291300
)
292301

293302
for attempt in range(self._config.retries):
294-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
303+
if self._config.proxy_rotator and static_proxy is None:
304+
proxy = self._config.proxy_rotator.get_proxy()
305+
else:
306+
proxy = static_proxy
295307

296308
async with self._page_generator(
297309
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -324,7 +336,7 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
324336
await page.wait_for_timeout(params.wait)
325337

326338
response = await ResponseFactory.from_async_playwright_response(
327-
page, first_response, final_response[0], params.selector_config
339+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
328340
)
329341
return response
330342

scrapling/engines/_browsers/_stealth.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,11 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
204204
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
205205
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
206206
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
207+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
207208
:return: A `Response` object.
208209
"""
210+
static_proxy = kwargs.pop("proxy", None)
211+
209212
params = _validate(kwargs, self, StealthConfig)
210213
if not self._is_alive: # pragma: no cover
211214
raise RuntimeError("Context manager has been closed")
@@ -218,7 +221,10 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
218221
)
219222

220223
for attempt in range(self._config.retries):
221-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
224+
if self._config.proxy_rotator and static_proxy is None:
225+
proxy = self._config.proxy_rotator.get_proxy()
226+
else:
227+
proxy = static_proxy
222228

223229
with self._page_generator(
224230
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -256,7 +262,7 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
256262
page.wait_for_timeout(params.wait)
257263

258264
response = ResponseFactory.from_playwright_response(
259-
page, first_response, final_response[0], params.selector_config
265+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
260266
)
261267
return response
262268

@@ -454,8 +460,11 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
454460
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
455461
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
456462
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
463+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
457464
:return: A `Response` object.
458465
"""
466+
static_proxy = kwargs.pop("proxy", None)
467+
459468
params = _validate(kwargs, self, StealthConfig)
460469

461470
if not self._is_alive: # pragma: no cover
@@ -469,7 +478,10 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
469478
)
470479

471480
for attempt in range(self._config.retries):
472-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
481+
if self._config.proxy_rotator and static_proxy is None:
482+
proxy = self._config.proxy_rotator.get_proxy()
483+
else:
484+
proxy = static_proxy
473485

474486
async with self._page_generator(
475487
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -507,7 +519,7 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
507519
await page.wait_for_timeout(params.wait)
508520

509521
response = await ResponseFactory.from_async_playwright_response(
510-
page, first_response, final_response[0], params.selector_config
522+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
511523
)
512524
return response
513525

scrapling/engines/_browsers/_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class PlaywrightFetchParams(TypedDict, total=False):
9999
selector_config: Optional[Dict]
100100
extra_headers: Optional[Dict[str, str]]
101101
wait_selector_state: SelectorWaitStates
102+
proxy: Optional[str | Dict[str, str]]
102103

103104
class StealthSession(PlaywrightSession, total=False):
104105
allow_webgl: bool

scrapling/engines/static.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool]
250250
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
251251
try:
252252
response = session.request(method, **request_args)
253-
result = ResponseFactory.from_http_request(response, selector_config)
253+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
254254
return result
255255
except CurlError as e: # pragma: no cover
256256
if attempt < max_retries - 1:
@@ -466,7 +466,7 @@ async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[
466466
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
467467
try:
468468
response = await session.request(method, **request_args)
469-
result = ResponseFactory.from_http_request(response, selector_config)
469+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
470470
return result
471471
except CurlError as e: # pragma: no cover
472472
if attempt < max_retries - 1:

scrapling/engines/toolbelt/convertor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def from_playwright_response(
8585
first_response: SyncResponse,
8686
final_response: Optional[SyncResponse],
8787
parser_arguments: Dict,
88+
meta: Optional[Dict] = None,
8889
) -> Response:
8990
"""
9091
Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -134,6 +135,7 @@ def from_playwright_response(
134135
"headers": first_response.all_headers(),
135136
"request_headers": first_response.request.all_headers(),
136137
"history": history,
138+
"meta": meta,
137139
**parser_arguments,
138140
}
139141
)
@@ -220,6 +222,7 @@ async def from_async_playwright_response(
220222
first_response: AsyncResponse,
221223
final_response: Optional[AsyncResponse],
222224
parser_arguments: Dict,
225+
meta: Optional[Dict] = None,
223226
) -> Response:
224227
"""
225228
Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -269,16 +272,18 @@ async def from_async_playwright_response(
269272
"headers": await first_response.all_headers(),
270273
"request_headers": await first_response.request.all_headers(),
271274
"history": history,
275+
"meta": meta,
272276
**parser_arguments,
273277
}
274278
)
275279

276280
@staticmethod
277-
def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
281+
def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
278282
"""Takes `curl_cffi` response and generates `Response` object from it.
279283
280284
:param response: `curl_cffi` response object
281285
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
286+
:param meta: Optional metadata dictionary to attach to the Response.
282287
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
283288
"""
284289
return Response(
@@ -293,6 +298,7 @@ def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Respons
293298
"request_headers": dict(response.request.headers) if response.request else {},
294299
"method": response.request.method if response.request else "GET",
295300
"history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
301+
"meta": meta,
296302
**parser_arguments,
297303
}
298304
)

scrapling/engines/toolbelt/custom.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def __init__(
3939
encoding: str = "utf-8",
4040
method: str = "GET",
4141
history: List | None = None,
42+
meta: Dict[str, Any] | None = None,
4243
**selector_config: Any,
4344
):
4445
adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
@@ -57,7 +58,10 @@ def __init__(
5758
# For easier debugging while working from a Python shell
5859
log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
5960

60-
self.meta: Dict[str, Any] = {}
61+
if meta and not isinstance(meta, dict):
62+
raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")
63+
64+
self.meta: Dict[str, Any] = meta or {}
6165
self.request: Optional["Request"] = None # Will be set by crawler
6266

6367
def follow(

scrapling/spiders/engine.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ async def _process_request(self, request: Request) -> None:
113113
retry_request._retry_count += 1
114114
retry_request.priority -= 1 # Don't retry immediately
115115
retry_request.dont_filter = True
116+
retry_request._session_kwargs.pop("proxy", None)
117+
retry_request._session_kwargs.pop("proxies", None)
118+
116119
new_request = await self.spider.retry_blocked_request(retry_request, response)
117120
self._normalize_request(new_request)
118121
await self.scheduler.enqueue(new_request)

scrapling/spiders/session.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ async def fetch(self, request: Request) -> Response:
124124
response = await session.fetch(url=request.url, **request._session_kwargs)
125125

126126
response.request = request
127-
response.meta = request.meta
127+
# Merge request meta into response meta (response meta takes priority)
128+
response.meta = {**request.meta, **response.meta}
128129
return response
129130
raise RuntimeError("No session found with the request session id")
130131

0 commit comments

Comments
 (0)