Skip to content

Commit b6e2770

Browse files
committed
Implement PR feedback for Stuttgarter Zeitung parser
1 parent db4113a commit b6e2770

File tree

1 file changed

+18
-14
lines changed

1 file changed

+18
-14
lines changed
Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
11
import datetime
22
from typing import List, Optional
33

4-
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
5-
from fundus.parser.data import ArticleSection, TextSequence
4+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
65
from fundus.parser.utility import (
76
extract_article_body_with_selector,
87
generic_author_parsing,
98
generic_date_parsing,
109
generic_topic_parsing,
11-
normalize_whitespace,
10+
image_extraction,
1211
)
1312
from lxml.cssselect import CSSSelector
13+
from lxml.etree import XPath
1414

1515

1616
class StuttgarterZeitungParser(ParserProxy):
@@ -20,17 +20,11 @@ class V1(BaseParser):
2020

2121
@attribute
2222
def body(self) -> ArticleBody:
23-
summary_text = self.precomputed.ld.bf_search("description")
24-
summary = TextSequence([summary_text]) if summary_text else TextSequence([])
25-
26-
paragraph_elements = self._paragraph_selector(self.precomputed.doc)
27-
paragraph_texts = [normalize_whitespace(elem.text_content()) for elem in paragraph_elements]
28-
29-
subheadline_elements = self._subheadline_selector(self.precomputed.doc)
30-
31-
sections = [ArticleSection(headline=TextSequence([]), paragraphs=TextSequence(paragraph_texts))]
32-
33-
return ArticleBody(summary=summary, sections=sections)
23+
return extract_article_body_with_selector(
24+
self.precomputed.doc,
25+
paragraph_selector=self._paragraph_selector,
26+
subheadline_selector=self._subheadline_selector,
27+
)
3428

3529
@attribute
3630
def publishing_date(self) -> Optional[datetime.datetime]:
@@ -47,3 +41,13 @@ def title(self) -> Optional[str]:
4741
@attribute
4842
def topics(self) -> List[str]:
4943
return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
44+
45+
@attribute
46+
def images(self) -> List[Image]:
47+
return image_extraction(
48+
doc=self.precomputed.doc,
49+
paragraph_selector=self._paragraph_selector,
50+
image_selector=XPath("//figure//img"),
51+
caption_selector=XPath("./ancestor::figure//figcaption"),
52+
relative_urls=True,
53+
)

0 commit comments

Comments
 (0)