1- from typing import List , Type
1+ from typing import Type
22
33from pydantic import ValidationError
44from pydantic_core import Url
55
6- from fastcrawler .exceptions import (ParserInvalidModelType ,
7- ParserValidationError )
6+ from fastcrawler .exceptions import ParserInvalidModelType , ParserValidationError
87
9- from .base import ParserProtocol
108from .pydantic import BaseModel , BaseModelType , URLs
11- from .selectors .base import BaseSelector
12- from .utils import get_inner_model , get_selector
9+ from .selectors .base import BaseSelector , get_selector
10+ from .utils import get_inner_model
1311
1412
15- class HTMLParser ( ParserProtocol ) :
13+ class HTMLParser :
1614 """
1715 HTMLParser first initiate the scraped data, then it parses a given HTML document
1816 based on the specified model. Using Pydantic model with XPATHField or CSSField.
@@ -27,6 +25,7 @@ class HTMLParser(ParserProtocol):
2725 # parse it later!
2826 html_parser.parse(a pydantic model built with XPATHField or CSSField)
2927 """
28+
3029 def __init__ (self , scraped_data : str ):
3130 """
3231 Initiate the HTML file in memory, so it can be parsed later
@@ -40,36 +39,33 @@ def parse(self, model: Type[BaseModelType]) -> BaseModelType:
4039 """
4140 Parse using the pydantic model
4241 """
43- if hasattr (model , "__mro__" ) and BaseModel in model . __mro__ : # type: ignore
42+ if issubclass (model , BaseModel ) : # type: ignore
4443 data = {}
4544 for field_name , field in model .model_fields .items ():
46- field_selector = get_selector (field )
47- if field_selector :
48- data [field_name ] = field_selector .resolve (
45+ fastcrawler_selector = get_selector (field )
46+ if fastcrawler_selector :
47+ data [field_name ] = fastcrawler_selector .resolve (
4948 scraped_data = self .scraped_data ,
50- model = get_inner_model (model , field_name )
49+ model = get_inner_model (
50+ model , field_name
51+ ), # TODO: check if pydantic returns the model data type
5152 )
5253
5354 if hasattr (
54- model .Config , "url_resolver" ,
55+ model .Config ,
56+ "url_resolver" ,
5557 ) and issubclass (model .Config .url_resolver .__class__ , BaseSelector ):
56- urls : List [Url ] = model .Config .url_resolver .resolve ( # type: ignore
58+ urls : list [Url ] = model .Config .url_resolver .resolve ( # type: ignore
5759 self .scraped_data ,
58- model = None
60+ model = None ,
5961 )
60- if urls :
61- self .resolver = URLs (
62- urls = urls
63- )
64- else :
65- self .resolver = URLs ()
62+ self .resolver = URLs (urls = urls or [])
6663
6764 try :
6865 self .data : BaseModelType | None = model .model_validate (data )
66+ return self .data
6967 except ValidationError as error :
7068 raise ParserValidationError (error .errors ()) from error
7169
72- return self .data
73-
7470 else :
7571 raise ParserInvalidModelType (model = model )
0 commit comments