|
| 1 | +from abstract_api.bases import BaseService |
| 2 | +from abstract_api.exceptions import ResponseParseError |
| 3 | + |
| 4 | +from .web_scraping_response import WebScrapingResponse |
| 5 | + |
| 6 | + |
class WebScraping(BaseService):
    """Client for the AbstractAPI web scraping service.

    Retrieves the data found at a caller-supplied URL.

    Attributes:
        _subdomain: Subdomain of the web scraping service.
    """
    _subdomain: str = "scrape"

    def scrape(
        self,
        url: str,
        render_js: bool | None = None,
        block_ads: bool | None = None,
        proxy_country: str | None = None
    ) -> WebScrapingResponse:
        """Scrape the given URL and wrap the service's answer.

        All arguments are forwarded unchanged, by keyword, to
        ``self._service_request``.

        Args:
            url: Address to extract data from. It should carry the full
                HTTP protocol prefix (http:// or https://). A URL that has
                parameters should be encoded, e.g. the & character
                becomes %26.
            render_js: When True, Javascript on the target site is rendered
                (via a headless Google Chrome browser). Documented as
                defaulting to False.
            block_ads: When True, advertisements that can be identified on
                the target site are blocked. Documented as defaulting to
                False.
            proxy_country: Country the request should be made from, given
                as a two letter ISO 3166-1 alpha-2 code.

        Returns:
            A WebScrapingResponse built from the service response.

        Raises:
            ResponseParseError: If constructing the WebScrapingResponse
                raises any exception; the original exception is chained
                as the cause.
        """
        raw = self._service_request(
            url=url,
            render_js=render_js,
            block_ads=block_ads,
            proxy_country=proxy_country
        )

        # Only the response construction is guarded: any failure there is
        # re-raised as a parse error with the original exception attached.
        try:
            return WebScrapingResponse(response=raw)
        except Exception as parse_error:
            raise ResponseParseError(
                "Failed to parse response as WebScrapingResponse"
            ) from parse_error
0 commit comments