
Commit 187f6b0

Merge pull request #28 from fast-crawler/feature/engine
✨ Add AioHTTP and Deprecated playwright from Engine
2 parents 103f61e + 567e084 commit 187f6b0

File tree

13 files changed: +506 -250 lines

fastcrawler/engine/__init__.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
 from .aio import AioHTTP
-from .base import ProxySetting
-from .playwright import Playwright
+from .base import ProxySetting, SetCookieParam
 
 __all__ = [
-    "Playwright",
     "ProxySetting",
-    "AioHTTP"
+    "SetCookieParam",
+    "AioHTTP",
 ]
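
The package root now exports the AioHTTP engine together with the cookie and proxy models, and no longer exports Playwright. A minimal sketch of the new import surface; the field values below are hypothetical placeholders and assume ProxySetting exposes the protocol/server/port fields that aio.py reads:

    from fastcrawler.engine import AioHTTP, ProxySetting, SetCookieParam

    # Hypothetical placeholder values, for illustration only.
    proxy = ProxySetting(protocol="http://", server="127.0.0.1", port=8080)
    cookie = SetCookieParam(name="session", value="abc123", domain="example.com")
    engine = AioHTTP(cookies=[cookie], proxy=proxy)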

fastcrawler/engine/aio.py

Lines changed: 122 additions & 3 deletions
@@ -1,5 +1,124 @@
-from fastcrawler.engine.base import EngineProto
+import asyncio
 
+import pydantic
+from aiohttp import BasicAuth, ClientSession, TCPConnector
+from aiohttp.cookiejar import Morsel
 
-class AioHTTP(EngineProto):
-    ...
+from fastcrawler.engine.base import ProxySetting, SetCookieParam
+
+
+class AioHTTP:
+    def __init__(
+        self,
+        cookies: list[SetCookieParam] | None = None,
+        headers: dict | None = None,
+        useragent: str | None = None,
+        proxy: ProxySetting | None = None,
+        connection_limit: int = 100,
+    ):
+        """Initialize a new engine instance with the given cookies, headers, useragent, and proxy."""
+        self.session = None
+        self._cookies = (
+            [(cookie.name, self.get_morsel_cookie(cookie)) for cookie in cookies]
+            if cookies is not None
+            else None
+        )
+
+        self._headers = headers or {}
+        if useragent:
+            self._headers["User-Agent"] = useragent
+
+        self._connector = TCPConnector(limit_per_host=connection_limit)
+
+        self._proxy = {}
+        if proxy:
+            proxy_url = f"{proxy.protocol}{proxy.server}:{proxy.port}"
+            self._proxy["proxy"] = proxy_url
+            if proxy.username and proxy.password:
+                auth = BasicAuth(login=proxy.username, password=proxy.password)
+                self._proxy["proxy_auth"] = auth
+
+    @property
+    def cookies(self):
+        return self._cookies
+
+    @property
+    def headers(self):
+        return self._headers
+
+    @property
+    def proxy(self):
+        return self._proxy
+
+    def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel:
+        """Convert a SetCookieParam object to a Morsel object."""
+        morsel_obj = Morsel()
+        morsel_obj.set(cookie.name, cookie.value, cookie.value)
+        morsel_obj.update(
+            dict(
+                domain=cookie.domain,
+                path=cookie.path,
+                expires=cookie.expires,
+                secure=cookie.secure,
+                httponly=cookie.httpOnly,
+                samesite=cookie.sameSite,
+            )
+        )
+        return morsel_obj
+
+    async def __aenter__(self):
+        """Async context manager support for engine -> ENTER"""
+        await self.setup()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager support for engine -> EXIT"""
+        await self.teardown()
+
+    async def setup(self, **kwargs) -> None:
+        """Set up the engine for crawling."""
+        self.session = ClientSession(
+            connector=self._connector,
+            cookies=self.cookies,
+            headers=self.headers,
+            trust_env=True,
+            **kwargs,
+        )
+
+    async def teardown(self) -> None:
+        """Clean up the engine."""
+        await self.session.close()
+
+    async def base(self, url: pydantic.AnyUrl, method: str, data: dict, **kwargs) -> str:
+        """Base method: send a single request and return the response body."""
+        async with self.session.request(
+            method, url, data=data, headers=self.headers, **self.proxy, **kwargs
+        ) as response:
+            return await response.text()
+
+    async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[str] | str:
+        """GET HTTP method to retrieve a list of URLs."""
+        tasks = [self.base(url, "GET", None, **kwargs) for url in urls]
+        return await asyncio.gather(*tasks)
+
+    async def post(
+        self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
+    ) -> list[str] | str:
+        """POST HTTP method to crawl a list of URLs."""
+        tasks = [self.base(url, "POST", data=data, **kwargs) for url, data in zip(urls, datas)]
+        return await asyncio.gather(*tasks)
+
+    async def put(
+        self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
+    ) -> list[str] | str:
+        """PUT HTTP method to crawl a list of URLs."""
+        tasks = [self.base(url, "PUT", data=data, **kwargs) for url, data in zip(urls, datas)]
+        return await asyncio.gather(*tasks)
+
+    async def delete(
+        self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
+    ) -> list[str] | str:
+        """DELETE HTTP method to crawl a list of URLs."""
+        tasks = [self.base(url, "DELETE", data=data, **kwargs) for url, data in zip(urls, datas)]
+        return await asyncio.gather(*tasks)
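
The new engine is meant to be driven as an async context manager: setup() opens one shared ClientSession over the pooled TCPConnector, and each verb helper fans its requests out concurrently via asyncio.gather. A minimal usage sketch; the URLs and useragent string are placeholders:

    import asyncio

    from fastcrawler.engine import AioHTTP


    async def main():
        urls = ["https://example.com/a", "https://example.com/b"]  # placeholders
        # __aenter__ runs setup(); __aexit__ runs teardown().
        async with AioHTTP(useragent="fastcrawler-demo") as engine:
            pages = await engine.get(urls)  # both pages are fetched concurrently
        for page in pages:
            print(len(page))


    asyncio.run(main())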

fastcrawler/engine/base.py

Lines changed: 37 additions & 34 deletions
@@ -1,6 +1,7 @@
+# pragma: no cover
 # pylint: disable=pointless-string-statement
 
-from typing import List, Literal, Protocol
+from typing import Literal, Protocol
 
 import pydantic
 
@@ -9,7 +10,7 @@ class SetCookieParam(pydantic.BaseModel):
     name: str
     value: str
     url: str | None = None
-    domain: str | None = None
+    domain: str = ""
     path: str | None = None
     expires: float | None = None
     httpOnly: bool | None = None
@@ -27,35 +28,37 @@ class ProxySetting(pydantic.BaseModel):
 
 class EngineProto(Protocol):
     def __init__(
-        self, cookie: List[dict] | None, header: dict | None,
-        useragent: dict | None, proxy: ProxySetting | None
-    ): ...
-    """Initialize a new engine instance with given cookie, header, useragent, and proxy
-    """
-    async def __aenter__(self): ...
-    """Async context manager support for engine -> ENTER
-    """
-    async def __aexit__(self, exc_type, exc_val, exc_tb): ...
-    """Async context manager support for engine -> EXIT
-    """
-    async def setup(self) -> None: ...
-    """Set-up up the engine for crawling purpose.
-    """
-    async def teardown(self) -> None: ...
-    """Cleans up the engine.
-    """
-    async def base(self, url: pydantic.AnyUrl, method: str, data: dict) -> str: ...
-    """Base Method for protocol to retrieve a list of URL.
-    """
-    async def get(self, urls: List[pydantic.AnyUrl]) -> str: ...
-    """GET HTTP Method for protocol to retrieve a list of URL.
-    """
-    async def post(self, urls: List[pydantic.AnyUrl], datas: List[dict]) -> str: ...
-    """POST HTTP Method for protocol to crawl a list of URL.
-    """
-    async def put(self, urls: List[pydantic.AnyUrl], datas: List[dict]) -> str: ...
-    """POST HTTP Method for protocol to crawl a list of URL.
-    """
-    async def delete(self, urls: List[pydantic.AnyUrl], datas: List[dict]) -> str: ...
-    """DELETE HTTP Method for protocol to crawl a list of URL.
-    """
+        self,
+        cookies: list[SetCookieParam] | None,
+        headers: dict | None,
+        useragent: str | None,
+        proxy: ProxySetting | None,
+    ):
+        """Initialize a new engine instance with the given cookies, headers, useragent, and proxy."""
+
+    async def __aenter__(self):
+        """Async context manager support for engine -> ENTER"""
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager support for engine -> EXIT"""
+
+    async def setup(self) -> None:
+        """Set up the engine for crawling."""
+
+    async def teardown(self) -> None:
+        """Clean up the engine."""
+
+    async def base(self, url: pydantic.AnyUrl, method: str, data: dict) -> str:
+        """Base method for the protocol to send a single request to a URL."""
+
+    async def get(self, urls: list[pydantic.AnyUrl]) -> str:
+        """GET HTTP method for the protocol to retrieve a list of URLs."""
+
+    async def post(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
+        """POST HTTP method for the protocol to crawl a list of URLs."""
+
+    async def put(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
+        """PUT HTTP method for the protocol to crawl a list of URLs."""
+
+    async def delete(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
+        """DELETE HTTP method for the protocol to crawl a list of URLs."""

fastcrawler/engine/playwright.py

Lines changed: 0 additions & 130 deletions
This file was deleted.

requirements-dev.txt

Lines changed: 10 additions & 0 deletions
@@ -1,2 +1,12 @@
 pytest
+pytest-asyncio
+cssselect==1.2.0
 pytest-cov
+pydantic_core==2.1.2
+pydantic_settings
+lxml
+aiohttp
+fastapi
+uvicorn
+colorama
+pyyaml

requirements/test.txt

Lines changed: 3 additions & 1 deletion
@@ -1 +1,3 @@
--r base.txt
+-r base.txt
+pytest
+asyncio-pytest
