
Commit 103f61e

✨ Add Regex compiler & Deprecate selectolax (#26)

* ⚡️ regex performance
* ⚡️ Improve duck-typing on the processor
* ✨ add _UNSET improving code readability
* ✏️ fix typos
* ♻️ refactor processor's default configuration
* ♻️ Refactor default configuration on selectors
* ✏️ fix typo mistakes
* ✅ add tests for selectolax
* ✏️ fix: typo mistake: List[...] -> list[...]
* 🗑️ deprecate selectolax for immaturity
* 🎨 Apply format with black
* perf: remove inheriting from base protocol

---------

Co-authored-by: Sadegh Yazdani
1 parent deee8b4 commit 103f61e

18 files changed: +432 −222 lines changed

fastcrawler/core/app.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -6,7 +6,7 @@
 class FastCrawler:
-    """ The client interface to start all crawlers.
+    """The client interface to start all crawlers.
     Initilize all crawlers

@@ -18,12 +18,15 @@ class FastCrawler:
     app.start()
     """
+
     crawlers: List[Crawler]

     def __init__(self, crawlers: List[Crawler] | Crawler):
-        """ Initilize FastCrawler with defined crawlers"""
+        """Initilize FastCrawler with defined crawlers"""
         if isinstance(crawlers, Crawler):
-            self.crawlers = [crawlers, ]
+            self.crawlers = [
+                crawlers,
+            ]
         else:
             self.crawlers = crawlers
```
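The constructor now normalizes a single `Crawler` into a one-element list, so both call styles below leave `app.crawlers` as a list. A minimal sketch, assuming `FastCrawler` and `Crawler` are importable as the docstring's `app.start()` example suggests (import paths and the `Crawler()` constructor are assumptions, not shown in this diff):

```python
# Sketch only: import paths and Crawler's constructor are assumptions.
from fastcrawler import FastCrawler
from fastcrawler.core import Crawler

crawler = Crawler()

app = FastCrawler(crawlers=crawler)    # a single crawler gets wrapped...
assert app.crawlers == [crawler]

app = FastCrawler(crawlers=[crawler])  # ...while a list is stored as-is
assert app.crawlers == [crawler]

app.start()  # start all registered crawlers
```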

fastcrawler/exceptions.py

Lines changed: 14 additions & 0 deletions

```diff
@@ -4,6 +4,7 @@ class BaseModelError(Exception):


 class NoCrawlerFoundError(BaseModelError):
     """No crawler is found in starting application"""
+
     def __init__(self):
         super().__init__(self, self.__doc__)

@@ -23,3 +24,16 @@ def __init__(self, model):
             "\nfrom fastcrawler import BaseModel"
         )
         super().__init__(self.message)
+
+
+class ProcessorNotSupported(BaseModelError):
+    def __init__(self, model):
+        self.model = model
+        self.message = (
+            f"The provided processor {model} is not supported.\n"
+            "To support the process, please explictly map the processor"
+            "inside the XPATH/CSS/Base selector, as a method called 'interface_mapper'"
+            "\nWe support full duck typing which means you can inject whatever"
+            "you need."
+        )
+        super().__init__(self.message)
```
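The new `ProcessorNotSupported` error steers users toward the duck-typed `interface_mapper` hook named in its message. A small sketch of raising and catching it; the `UnknownProcessor` class is hypothetical, only the exception itself comes from this diff:

```python
from fastcrawler.exceptions import ProcessorNotSupported


class UnknownProcessor:
    """A hypothetical processor the selectors don't know how to map."""


try:
    # A selector that cannot map this processor would raise:
    raise ProcessorNotSupported(UnknownProcessor())
except ProcessorNotSupported as exc:
    print(exc.message)  # points to the 'interface_mapper' escape hatch
```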

fastcrawler/parsers/__init__.py

Lines changed: 10 additions & 3 deletions

```diff
@@ -1,16 +1,23 @@
 from .html import HTMLParser
 from .json import JsonParser
+from .processors.lxml import LxmlProcessor
+
+# from .processors.modest import ModestProcessor
 from .pydantic import BaseModel
 from .selectors.css import CSSField
-from .selectors.xpath import XPATHField
 from .selectors.regex import RegexField
+from .selectors.xpath import XPATHField

 __all__ = [
+    # Selectors
     "XPATHField",
     "BaseModel",
     "CSSField",
     "RegexField",
-
+    # Parsers
     "JsonParser",
-    "HTMLParser"
+    "HTMLParser",
+    # Processors
+    "ModestProcessor",
+    "LxmlProcessor",
 ]
```
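With `RegexField` now re-exported next to the other selectors, a model can mix selector types. A sketch under the assumption that `RegexField` takes a pattern keyword the way `XPATHField` takes a query; neither signature is shown in this diff:

```python
from fastcrawler.parsers import BaseModel, RegexField, XPATHField


class Product(BaseModel):
    # Keyword names are assumptions; the diff only shows the exports.
    title: str = XPATHField(query="//h1/text()")
    price: str = RegexField(regex=r"\$\d+\.\d{2}")
```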

fastcrawler/parsers/base.py

Lines changed: 10 additions & 7 deletions

```diff
@@ -4,10 +4,13 @@


 class ParserProtocol(Protocol):
-    def __init__(self, scraped_data: Any): ...
-    """Initilize the parser with the given data (html/json/etc)"""
-    def parse(self, model: Any) -> Any: ...
-    """
-    Parse the saved data, with given model, which should be a pydantic model
-    imported from fastcrawler library
-    """
+    def __init__(self, scraped_data: Any):
+        """Initilize the parser with the given data (html/json/etc)"""
+        ...
+
+    def parse(self, model: Any) -> Any:
+        """
+        Parse the saved data, with given model, which should be a pydantic model
+        imported from fastcrawler library
+        """
+        ...
```
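Per the "remove inheriting from base protocol" item in the commit message, parsers now satisfy `ParserProtocol` structurally rather than by inheritance (see the html.py and json.py diffs below). A minimal sketch of what that buys; the `TextParser` class here is hypothetical:

```python
from typing import Any

from fastcrawler.parsers.base import ParserProtocol


class TextParser:  # no base class required
    def __init__(self, scraped_data: str):
        self.scraped_data = scraped_data

    def parse(self, model: Any) -> Any:
        return model


def run(parser: ParserProtocol) -> None:
    parser.parse(model=None)


run(TextParser("<html></html>"))  # accepted: the shape matches the protocol
```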

fastcrawler/parsers/html.py

Lines changed: 19 additions & 23 deletions

```diff
@@ -1,18 +1,16 @@
-from typing import List, Type
+from typing import Type

 from pydantic import ValidationError
 from pydantic_core import Url

-from fastcrawler.exceptions import (ParserInvalidModelType,
-                                    ParserValidationError)
+from fastcrawler.exceptions import ParserInvalidModelType, ParserValidationError

-from .base import ParserProtocol
 from .pydantic import BaseModel, BaseModelType, URLs
-from .selectors.base import BaseSelector
-from .utils import get_inner_model, get_selector
+from .selectors.base import BaseSelector, get_selector
+from .utils import get_inner_model


-class HTMLParser(ParserProtocol):
+class HTMLParser:
     """
     HTMLParser first initiate the scraped data, then it parses a given HTML document
     based on the specified model. Using Pydantic model with XPATHField or CSSField.
@@ -27,6 +25,7 @@ class HTMLParser(ParserProtocol):
     # parse it later!
     html_parser.parse(a pydantic model built with XPATHField or CSSField)
     """
+
     def __init__(self, scraped_data: str):
         """
         Initiate the HTML file in memory, so it can be parsed later
@@ -40,36 +39,33 @@ def parse(self, model: Type[BaseModelType]) -> BaseModelType:
         """
         Parse using the pydantic model
         """
-        if hasattr(model, "__mro__") and BaseModel in model.__mro__:  # type: ignore
+        if issubclass(model, BaseModel):  # type: ignore
             data = {}
             for field_name, field in model.model_fields.items():
-                field_selector = get_selector(field)
-                if field_selector:
-                    data[field_name] = field_selector.resolve(
+                fastcrawler_selector = get_selector(field)
+                if fastcrawler_selector:
+                    data[field_name] = fastcrawler_selector.resolve(
                         scraped_data=self.scraped_data,
-                        model=get_inner_model(model, field_name)
+                        model=get_inner_model(
+                            model, field_name
+                        ),  # TODO: check if pydantic returns the model data type
                     )

             if hasattr(
-                model.Config, "url_resolver",
+                model.Config,
+                "url_resolver",
             ) and issubclass(model.Config.url_resolver.__class__, BaseSelector):
-                urls: List[Url] = model.Config.url_resolver.resolve(  # type: ignore
+                urls: list[Url] = model.Config.url_resolver.resolve(  # type: ignore
                     self.scraped_data,
-                    model=None
+                    model=None,
                 )
-                if urls:
-                    self.resolver = URLs(
-                        urls=urls
-                    )
-                else:
-                    self.resolver = URLs()
+                self.resolver = URLs(urls=urls or [])

             try:
                 self.data: BaseModelType | None = model.model_validate(data)
+                return self.data
             except ValidationError as error:
                 raise ParserValidationError(error.errors()) from error

-            return self.data
-
         else:
             raise ParserInvalidModelType(model=model)
```
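A usage sketch of the reworked `parse` flow; note that `return self.data` moved inside the `try`, so a successful parse now returns the validated model directly. The `query` keyword on `XPATHField` is an assumption consistent with the class docstring:

```python
from fastcrawler.parsers import BaseModel, HTMLParser, XPATHField


class Page(BaseModel):
    heading: str = XPATHField(query="//h1/text()")  # keyword name is assumed


html_parser = HTMLParser("<html><h1>Hello</h1></html>")
page = html_parser.parse(Page)  # returns the validated Page instance
print(page.heading)             # "Hello"
```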

fastcrawler/parsers/json.py

Lines changed: 8 additions & 6 deletions

```diff
@@ -3,14 +3,11 @@
 from pydantic import ValidationError
 from pydantic_core import Url

-from fastcrawler.exceptions import (ParserInvalidModelType,
-                                    ParserValidationError)
+from fastcrawler.exceptions import ParserInvalidModelType, ParserValidationError
 from fastcrawler.parsers.pydantic import BaseModel, BaseModelType, URLs

-from .base import ParserProtocol

-
-class JsonParser(ParserProtocol):
+class JsonParser:
     """
     HTMLParser first initiate the scraped data, then it parses a given HTML document
     based on the specified model. Using Pydantic model with XPATHField or CSSField.
@@ -25,6 +22,7 @@ class JsonParser(ParserProtocol):
     # parse it later!
     html_parser.parse(a pydantic model built with XPATHField or CSSField)
     """
+
     data = None

     def __init__(self, scraped_data: dict):
@@ -49,7 +47,11 @@ def parse(self, model: Type[BaseModelType]) -> BaseModelType:
         current_address: dict = self.scraped_data.copy()
         for address in model.Config.url_resolver.split("."):
             current_address = current_address.get(address)  # type: ignore
-        self.resolver = URLs(urls=[Url(current_address), ])
+        self.resolver = URLs(
+            urls=[
+                Url(current_address),
+            ]
+        )
         try:
             self.data = model.model_validate(self.data)
         except ValidationError as error:
```
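`JsonParser` walks a dotted `url_resolver` path through the payload to build its `URLs` resolver, as the loop above shows. A sketch with an illustrative model; the field and key names here are made up:

```python
from fastcrawler.parsers import BaseModel, JsonParser


class ApiPage(BaseModel):
    total: int  # illustrative field mapped straight from the JSON payload

    class Config:
        url_resolver = "pagination.next"  # dotted path into the payload


parser = JsonParser({"total": 42, "pagination": {"next": "https://example.com/p/2"}})
page = parser.parse(ApiPage)
print(parser.resolver.urls)  # [Url('https://example.com/p/2')]
```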

fastcrawler/parsers/processors/base.py

Lines changed: 7 additions & 5 deletions

```diff
@@ -1,4 +1,4 @@
-from typing import List, Protocol
+from typing import Protocol


 class ElementInterface(Protocol):
@@ -35,16 +35,18 @@ def to_string(result: ElementInterface) -> str:

     @staticmethod
     def from_string_by_xpath(
-        string: str, query: str
-    ) -> List[ElementInterface] | ElementInterface | None:
+        string: str,
+        query: str,
+    ) -> list[ElementInterface] | ElementInterface | None:
         """
         Resolves a HTML string by XPATH
         """

     @staticmethod
     def from_string_by_css(
-        string: str, query: str
-    ) -> List[ElementInterface] | ElementInterface | None:
+        string: str,
+        query: str,
+    ) -> list[ElementInterface] | ElementInterface | None:
         """
         Resolves a HTML string by CSS
         """
```
fastcrawler/parsers/processors/lxml.py

Lines changed: 9 additions & 10 deletions

```diff
@@ -1,13 +1,10 @@
 # pylint: disable=c-extension-no-member
-from typing import List

 from lxml import etree  # type: ignore[attr-defined]
 from lxml import html as lxml_html  # type: ignore[attr-defined]

-from .base import ProcessorInterface

-
-class LxmlProcessor(ProcessorInterface):
+class LxmlProcessor:
     base_element = etree.ElementBase

     @staticmethod
@@ -20,22 +17,24 @@ def to_string(result: etree.ElementBase) -> str:

     @staticmethod
     def from_string_by_xpath(
-        string: str, query: str
-    ) -> etree.ElementBase | List[etree.ElementBase] | None:
+        string: str,
+        query: str,
+    ) -> etree.ElementBase | list[etree.ElementBase] | None:
         """
         Resolves a HTML string by XPATH
         """
         tree = lxml_html.fromstring(string)
-        results: List[etree.ElementBase] = tree.xpath(query)
+        results: list[etree.ElementBase] = tree.xpath(query)
         return results

     @staticmethod
     def from_string_by_css(
-        string: str, query: str
-    ) -> etree.ElementBase | List[etree.ElementBase] | None:
+        string: str,
+        query: str,
+    ) -> etree.ElementBase | list[etree.ElementBase] | None:
         """
         Resolves a HTML string by CSS
         """
         tree = lxml_html.fromstring(string)
-        results: List[etree.ElementBase] = tree.cssselect(query)
+        results: list[etree.ElementBase] = tree.cssselect(query)
         return results
```
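A quick sketch of the processor in use. `fromstring`, `xpath`, and `cssselect` are standard lxml APIs, with CSS selection additionally requiring the `cssselect` package:

```python
from fastcrawler.parsers import LxmlProcessor

html = "<ul><li>first</li><li>second</li></ul>"

items = LxmlProcessor.from_string_by_xpath(html, "//li")
print([el.text for el in items])  # ['first', 'second']

# CSS selection goes through lxml's cssselect (needs the cssselect package):
same_items = LxmlProcessor.from_string_by_css(html, "li")
```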
fastcrawler/parsers/processors/modest.py

Lines changed: 37 additions & 0 deletions

```diff
@@ -0,0 +1,37 @@
+# NOTE: This library is not matured yet to be used
+
+
+# from selectolax.parser import HTMLParser, Node
+
+# from .base import ElementInterface
+
+
+# class ModestProcessor:
+#     base_element = Node
+
+#     @staticmethod
+#     def to_string(result: Node) -> str:
+#         """
+#         Resolves a result to string, by getting the inner html,
+#         This method is used to iterate over HTML elements to resolve inner pydantic models
+#         """
+#         return result.html
+
+#     @staticmethod
+#     def from_string_by_xpath(
+#         string: str, query: str
+#     ) -> list[ElementInterface] | ElementInterface | None:
+#         """
+#         Resolves a HTML string by XPATH
+#         """
+#         raise NotImplementedError("XPATH is not supported in selectolax")
+
+#     @staticmethod
+#     def from_string_by_css(
+#         string: str, query: str
+#     ) -> list[ElementInterface] | ElementInterface | None:
+#         """
+#         Resolves a HTML string by CSS
+#         """
+#         results = HTMLParser(string).css(query)
+#         return results
```

fastcrawler/parsers/pydantic.py

Lines changed: 15 additions & 5 deletions

```diff
@@ -1,23 +1,33 @@
-from typing import TYPE_CHECKING, List, TypeVar, Union
+from typing import TYPE_CHECKING, TypeVar, Union

 from pydantic import AnyUrl
 from pydantic import BaseModel as _BaseModel

 if TYPE_CHECKING:
-    from fastcrawler.parsers.selectors.base import \
-        BaseSelector  # pragma: no cover
+    from fastcrawler.parsers.selectors.base import BaseSelector  # pragma: no cover
+
+
+class MappedAttr(_BaseModel):
+    is_property: bool
+    attr_name: str
+
+
+class MappedResult(_BaseModel):
+    get: MappedAttr
+    text: MappedAttr


 class BaseModel(_BaseModel):
     """
     Custom basemodel created from Pydantic :)
     """
+
     class Config:
         url_resolver: Union["BaseSelector", str]


 class URLs(BaseModel):
-    urls: List[AnyUrl] = []
+    urls: list[AnyUrl] = []


-BaseModelType = TypeVar('BaseModelType', bound=BaseModel)
+BaseModelType = TypeVar("BaseModelType", bound=BaseModel)
```
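`MappedAttr` and `MappedResult` look like the data carriers for the `interface_mapper` duck-typing referenced in `ProcessorNotSupported`; this diff never shows them in use, so the wiring below is purely an assumption:

```python
from fastcrawler.parsers.pydantic import MappedAttr, MappedResult

# Hypothetical mapping: describe where a processor's element exposes
# `get` and `text`, and whether each is a property or a method.
element_map = MappedResult(
    get=MappedAttr(is_property=False, attr_name="get"),   # element.get(...)
    text=MappedAttr(is_property=True, attr_name="text"),  # element.text
)
print(element_map.text.attr_name)  # "text"
```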
