11import datetime
22from typing import List , Optional
33
4- from fundus .parser import ArticleBody , BaseParser , ParserProxy , attribute
5- from fundus .parser .data import ArticleSection , TextSequence
4+ from fundus .parser import ArticleBody , BaseParser , Image , ParserProxy , attribute
65from fundus .parser .utility import (
76 extract_article_body_with_selector ,
87 generic_author_parsing ,
98 generic_date_parsing ,
109 generic_topic_parsing ,
11- normalize_whitespace ,
10+ image_extraction ,
1211)
1312from lxml .cssselect import CSSSelector
13+ from lxml .etree import XPath
1414
1515
1616class StuttgarterZeitungParser (ParserProxy ):
@@ -20,17 +20,11 @@ class V1(BaseParser):
2020
@attribute
def body(self) -> ArticleBody:
    """Build the article body from the parsed document.

    Delegates to ``extract_article_body_with_selector`` and hands it both
    the paragraph selector and the subheadline selector. The previous
    hand-rolled version evaluated the subheadline selector but discarded
    the result, putting every paragraph into one section with an empty
    headline.

    Returns:
        The ``ArticleBody`` produced by the shared extraction helper.
    """
    # NOTE(review): no summary selector is passed, so the LD+JSON
    # "description" used by the older implementation is no longer fed
    # into the body here — confirm this is intended.
    return extract_article_body_with_selector(
        self.precomputed.doc,
        paragraph_selector=self._paragraph_selector,
        subheadline_selector=self._subheadline_selector,
    )
3428
3529 @attribute
3630 def publishing_date (self ) -> Optional [datetime .datetime ]:
@@ -47,3 +41,13 @@ def title(self) -> Optional[str]:
@attribute
def topics(self) -> List[str]:
    """Return the article topics.

    Looks up the "keywords" entry in the precomputed LD+JSON data and
    passes whatever is found to ``generic_topic_parsing``.
    """
    raw_keywords = self.precomputed.ld.bf_search("keywords")
    return generic_topic_parsing(raw_keywords)
44+
@attribute
def images(self) -> List[Image]:
    """Return the images found in the article document.

    Delegates to ``image_extraction``, selecting every ``<img>`` nested
    inside a ``<figure>`` and taking captions from ``<figcaption>``
    elements below an ancestor ``<figure>`` of the image.
    """
    figure_image_selector = XPath("//figure//img")
    # Evaluated relative to each image: climb to an enclosing <figure>,
    # then descend to its <figcaption>.
    figure_caption_selector = XPath("./ancestor::figure//figcaption")

    # NOTE(review): relative_urls=True presumably tells the helper that
    # image URLs on this site may be relative — confirm against the helper.
    return image_extraction(
        doc=self.precomputed.doc,
        paragraph_selector=self._paragraph_selector,
        image_selector=figure_image_selector,
        caption_selector=figure_caption_selector,
        relative_urls=True,
    )
0 commit comments