Skip to content

Commit 2d3d62f

Browse files
authored
Add website scraping service (#18)
1 parent c533b27 commit 2d3d62f

File tree

4 files changed

+73
-0
lines changed

4 files changed

+73
-0
lines changed

src/abstract_api/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .phone_validation import PhoneValidation
1111
from .timezone import Timezone
1212
from .vat import VAT
13+
from .web_scraping import WebScraping
1314
from .website_screenshot import WebsiteScreenshot
1415

1516
__all__: Final[list[str]] = [
@@ -23,5 +24,6 @@
2324
"PhoneValidation",
2425
"Timezone",
2526
"VAT",
27+
"WebScraping",
2628
"WebsiteScreenshot"
2729
]
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from typing import Final
2+
3+
from .web_scraping import WebScraping
4+
from .web_scraping_response import WebScrapingResponse
5+
6+
__all__: Final[list[str]] = [
7+
"WebScraping",
8+
"WebScrapingResponse"
9+
]
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from abstract_api.bases import BaseService
2+
from abstract_api.exceptions import ResponseParseError
3+
4+
from .web_scraping_response import WebScrapingResponse
5+
6+
7+
class WebScraping(BaseService):
    """Client for the AbstractAPI web scraping service.

    Retrieves the data found at a given URL through the service.

    Attributes:
        _subdomain: Web scraping service subdomain.
    """
    _subdomain: str = "scrape"

    def scrape(
        self,
        url: str,
        render_js: bool | None = None,
        block_ads: bool | None = None,
        proxy_country: str | None = None
    ) -> WebScrapingResponse:
        """Scrape the given URL and wrap the result.

        Args:
            url: Address of the page to extract data from. It should carry
                the full HTTP protocol prefix (http:// or https://). A URL
                that has parameters of its own should be encoded first,
                e.g. the & character becomes %26.
            render_js: When True, Javascript on the target site is rendered
                (via a Google Chrome headless browser). The service treats
                this as False by default.
            block_ads: When True, advertisements the service can identify
                on the target site are blocked. The service treats this as
                False by default.
            proxy_country: Country the request should be made from, given
                as a two letter ISO 3166-1 alpha-2 code.

        Returns:
            WebScrapingResponse built from the API call response.

        Raises:
            ResponseParseError: If constructing the WebScrapingResponse
                from the service response fails for any reason.
        """
        # Collected in the same order they are forwarded as keyword
        # arguments to the base-class request helper.
        query = {
            "url": url,
            "render_js": render_js,
            "block_ads": block_ads,
            "proxy_country": proxy_country,
        }
        raw = self._service_request(**query)

        # Only the response construction is guarded; errors raised by the
        # request itself propagate unchanged.
        try:
            return WebScrapingResponse(response=raw)
        except Exception as error:
            raise ResponseParseError(
                "Failed to parse response as WebScrapingResponse"
            ) from error
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from abstract_api.bases import FileResponse
2+
3+
4+
class WebScrapingResponse(FileResponse):
    """Response returned by the web scraping service.

    Adds nothing of its own; all behaviour comes from FileResponse.
    """

0 commit comments

Comments
 (0)