Skip to content

Commit f885027

Browse files
committed
feat(proxy control): Force a proxy at request level at any given point
Also merges the request's meta into the response's meta
1 parent 6e0277f commit f885027

File tree

9 files changed

+54
-15
lines changed

9 files changed

+54
-15
lines changed

scrapling/engines/_browsers/_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def _page_generator(
175175
proxy: Optional[ProxyType] = None,
176176
) -> Generator["PageInfo[Page]", None, None]:
177177
"""Acquire a page - either from persistent context or fresh context with proxy."""
178-
if self._config.proxy_rotator:
178+
if proxy:
179179
# Rotation mode: create fresh context with the provided proxy
180180
if not self.browser: # pragma: no cover
181181
raise RuntimeError("Browser not initialized for proxy rotation mode")
@@ -344,7 +344,7 @@ async def _page_generator(
344344
proxy: Optional[ProxyType] = None,
345345
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346346
"""Acquire a page - either from persistent context or fresh context with proxy."""
347-
if self._config.proxy_rotator:
347+
if proxy:
348348
# Rotation mode: create fresh context with the provided proxy
349349
if not self.browser: # pragma: no cover
350350
raise RuntimeError("Browser not initialized for proxy rotation mode")

scrapling/engines/_browsers/_controllers.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,11 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
115115
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
116116
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
117117
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
118+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
118119
:return: A `Response` object.
119120
"""
121+
static_proxy = kwargs.pop("proxy", None)
122+
120123
params = _validate(kwargs, self, PlaywrightConfig)
121124
if not self._is_alive: # pragma: no cover
122125
raise RuntimeError("Context manager has been closed")
@@ -129,7 +132,10 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
129132
)
130133

131134
for attempt in range(self._config.retries):
132-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
135+
if self._config.proxy_rotator and static_proxy is None:
136+
proxy = self._config.proxy_rotator.get_proxy()
137+
else:
138+
proxy = static_proxy
133139

134140
with self._page_generator(
135141
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -162,7 +168,7 @@ def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
162168
page.wait_for_timeout(params.wait)
163169

164170
response = ResponseFactory.from_playwright_response(
165-
page, first_response, final_response[0], params.selector_config
171+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
166172
)
167173
return response
168174

@@ -276,8 +282,11 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
276282
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
277283
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
278284
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
285+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
279286
:return: A `Response` object.
280287
"""
288+
static_proxy = kwargs.pop("proxy", None)
289+
281290
params = _validate(kwargs, self, PlaywrightConfig)
282291

283292
if not self._is_alive: # pragma: no cover
@@ -291,7 +300,10 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
291300
)
292301

293302
for attempt in range(self._config.retries):
294-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
303+
if self._config.proxy_rotator and static_proxy is None:
304+
proxy = self._config.proxy_rotator.get_proxy()
305+
else:
306+
proxy = static_proxy
295307

296308
async with self._page_generator(
297309
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -324,7 +336,7 @@ async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Resp
324336
await page.wait_for_timeout(params.wait)
325337

326338
response = await ResponseFactory.from_async_playwright_response(
327-
page, first_response, final_response[0], params.selector_config
339+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
328340
)
329341
return response
330342

scrapling/engines/_browsers/_stealth.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,11 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
204204
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
205205
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
206206
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
207+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
207208
:return: A `Response` object.
208209
"""
210+
static_proxy = kwargs.pop("proxy", None)
211+
209212
params = _validate(kwargs, self, StealthConfig)
210213
if not self._is_alive: # pragma: no cover
211214
raise RuntimeError("Context manager has been closed")
@@ -218,7 +221,10 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
218221
)
219222

220223
for attempt in range(self._config.retries):
221-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
224+
if self._config.proxy_rotator and static_proxy is None:
225+
proxy = self._config.proxy_rotator.get_proxy()
226+
else:
227+
proxy = static_proxy
222228

223229
with self._page_generator(
224230
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -256,7 +262,7 @@ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
256262
page.wait_for_timeout(params.wait)
257263

258264
response = ResponseFactory.from_playwright_response(
259-
page, first_response, final_response[0], params.selector_config
265+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
260266
)
261267
return response
262268

@@ -454,8 +460,11 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
454460
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
455461
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
456462
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
463+
:param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
457464
:return: A `Response` object.
458465
"""
466+
static_proxy = kwargs.pop("proxy", None)
467+
459468
params = _validate(kwargs, self, StealthConfig)
460469

461470
if not self._is_alive: # pragma: no cover
@@ -469,7 +478,10 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
469478
)
470479

471480
for attempt in range(self._config.retries):
472-
proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
481+
if self._config.proxy_rotator and static_proxy is None:
482+
proxy = self._config.proxy_rotator.get_proxy()
483+
else:
484+
proxy = static_proxy
473485

474486
async with self._page_generator(
475487
params.timeout, params.extra_headers, params.disable_resources, proxy
@@ -507,7 +519,7 @@ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Respons
507519
await page.wait_for_timeout(params.wait)
508520

509521
response = await ResponseFactory.from_async_playwright_response(
510-
page, first_response, final_response[0], params.selector_config
522+
page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
511523
)
512524
return response
513525

scrapling/engines/_browsers/_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class PlaywrightFetchParams(TypedDict, total=False):
9999
selector_config: Optional[Dict]
100100
extra_headers: Optional[Dict[str, str]]
101101
wait_selector_state: SelectorWaitStates
102+
proxy: Optional[str | Dict[str, str]]
102103

103104
class StealthSession(PlaywrightSession, total=False):
104105
allow_webgl: bool

scrapling/engines/static.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool]
250250
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
251251
try:
252252
response = session.request(method, **request_args)
253-
result = ResponseFactory.from_http_request(response, selector_config)
253+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
254254
return result
255255
except CurlError as e: # pragma: no cover
256256
if attempt < max_retries - 1:
@@ -466,7 +466,7 @@ async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[
466466
request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
467467
try:
468468
response = await session.request(method, **request_args)
469-
result = ResponseFactory.from_http_request(response, selector_config)
469+
result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
470470
return result
471471
except CurlError as e: # pragma: no cover
472472
if attempt < max_retries - 1:

scrapling/engines/toolbelt/convertor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def from_playwright_response(
8585
first_response: SyncResponse,
8686
final_response: Optional[SyncResponse],
8787
parser_arguments: Dict,
88+
meta: Optional[Dict] = None,
8889
) -> Response:
8990
"""
9091
Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -134,6 +135,7 @@ def from_playwright_response(
134135
"headers": first_response.all_headers(),
135136
"request_headers": first_response.request.all_headers(),
136137
"history": history,
138+
"meta": meta,
137139
**parser_arguments,
138140
}
139141
)
@@ -220,6 +222,7 @@ async def from_async_playwright_response(
220222
first_response: AsyncResponse,
221223
final_response: Optional[AsyncResponse],
222224
parser_arguments: Dict,
225+
meta: Optional[Dict] = None,
223226
) -> Response:
224227
"""
225228
Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -269,16 +272,18 @@ async def from_async_playwright_response(
269272
"headers": await first_response.all_headers(),
270273
"request_headers": await first_response.request.all_headers(),
271274
"history": history,
275+
"meta": meta,
272276
**parser_arguments,
273277
}
274278
)
275279

276280
@staticmethod
277-
def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response:
281+
def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
278282
"""Takes `curl_cffi` response and generates `Response` object from it.
279283
280284
:param response: `curl_cffi` response object
281285
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
286+
:param meta: Optional metadata dictionary to attach to the Response.
282287
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
283288
"""
284289
return Response(
@@ -293,6 +298,7 @@ def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Respons
293298
"request_headers": dict(response.request.headers) if response.request else {},
294299
"method": response.request.method if response.request else "GET",
295300
"history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
301+
"meta": meta,
296302
**parser_arguments,
297303
}
298304
)

scrapling/engines/toolbelt/custom.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def __init__(
3939
encoding: str = "utf-8",
4040
method: str = "GET",
4141
history: List | None = None,
42+
meta: Dict[str, Any] | None = None,
4243
**selector_config: Any,
4344
):
4445
adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
@@ -57,7 +58,10 @@ def __init__(
5758
# For easier debugging while working from a Python shell
5859
log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
5960

60-
self.meta: Dict[str, Any] = {}
61+
if meta and not isinstance(meta, dict):
62+
raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")
63+
64+
self.meta: Dict[str, Any] = meta or {}
6165
self.request: Optional["Request"] = None # Will be set by crawler
6266

6367
def follow(

scrapling/spiders/engine.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ async def _process_request(self, request: Request) -> None:
113113
retry_request._retry_count += 1
114114
retry_request.priority -= 1 # Don't retry immediately
115115
retry_request.dont_filter = True
116+
retry_request._session_kwargs.pop("proxy", None)
117+
retry_request._session_kwargs.pop("proxies", None)
118+
116119
new_request = await self.spider.retry_blocked_request(retry_request, response)
117120
self._normalize_request(new_request)
118121
await self.scheduler.enqueue(new_request)

scrapling/spiders/session.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ async def fetch(self, request: Request) -> Response:
124124
response = await session.fetch(url=request.url, **request._session_kwargs)
125125

126126
response.request = request
127-
response.meta = request.meta
127+
# Merge request meta into response meta (response meta takes priority)
128+
response.meta = {**request.meta, **response.meta}
128129
return response
129130
raise RuntimeError("No session found with the request session id")
130131

0 commit comments

Comments
 (0)