Commit 38ded5f

Merge pull request #792 from flairNLP/add-tag-filter
Add tag filter to text extraction
2 parents: fdf30c8 + d46e231

2 files changed (+20, -12 lines)

src/fundus/parser/utility.py (18 additions, 5 deletions)
```diff
@@ -76,13 +76,17 @@ class Node:
     # one could replace this recursion with XPath using an expression like this:
     # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
     # than simply using the implemented mixture below
-    def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
+    def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str:
         guarded_excluded_tags: List[str] = excluded_tags or []

         def _text_content(element: lxml.html.HtmlElement) -> str:
-            if element.tag in guarded_excluded_tags:
+            if (
+                element.tag in guarded_excluded_tags
+                or isinstance(element, lxml.html.HtmlComment)
+                or (tag_filter and tag_filter(element))
+            ):
                 return element.tail or ""
-            text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
+            text = element.text or ""
             children = "".join([_text_content(child) for child in element.iterchildren()])
             tail = element.tail or ""
             return text + children + tail
@@ -133,6 +137,7 @@ def extract_article_body_with_selector(
     paragraph_selector: XPath,
     summary_selector: Optional[XPath] = None,
     subheadline_selector: Optional[XPath] = None,
+    tag_filter: Optional[XPath] = None,
 ) -> ArticleBody:
     # depth first index for each element in tree
     df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())}
@@ -164,14 +169,22 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     instructions = itertools.chain([first, []], instructions)

     summary = TextSequence(
-        map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions))
+        map(
+            lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
+            next(instructions),
+        )
     )
     sections: List[ArticleSection] = []

     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), c)) for c in chunk]
+        texts = [
+            list(
+                map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)), c)
+            )
+            for c in chunk
+        ]
         sections.append(ArticleSection(*map(TextSequence, texts)))

     return ArticleBody(summary=summary, sections=sections)
```
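
For reference, a minimal sketch of how the new `tag_filter` hook behaves (the standalone `text_content` helper and the sample markup below are illustrative, not part of the commit): the XPath predicate is evaluated against every element during the recursive walk, and a matching element is skipped entirely while its tail text is preserved.

```python
# Minimal sketch, assuming lxml is installed; mirrors the filtering logic
# added to Node.text_content, but as a free function for demonstration.
import lxml.html
from lxml.etree import XPath


def text_content(element, tag_filter=None):
    # Skip comments and any element matching the filter, but keep its tail text.
    if isinstance(element, lxml.html.HtmlComment) or (tag_filter and tag_filter(element)):
        return element.tail or ""
    text = element.text or ""
    children = "".join(text_content(child, tag_filter) for child in element.iterchildren())
    return text + children + (element.tail or "")


doc = lxml.html.fromstring('<p>Read more <span class="go-vh">(hidden note)</span> here.</p>')
print(text_content(doc, tag_filter=XPath("self::*[@class='go-vh']")))
# -> "Read more  here."
```

The excess whitespace left behind by a skipped element is harmless in practice, since `extract_article_body_with_selector` pipes every extracted text through `normalize_whitespace`.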

src/fundus/publishers/de/golem.py (2 additions, 7 deletions)
```diff
@@ -30,18 +30,13 @@ class V1(BaseParser):

     @attribute
     def body(self) -> Optional[ArticleBody]:
-        body = extract_article_body_with_selector(
+        return extract_article_body_with_selector(
             self.precomputed.doc,
             summary_selector=self._summary_selector,
             paragraph_selector=self._paragraph_selector,
             subheadline_selector=self._subheadline_selector,
+            tag_filter=XPath("self::*[@class='go-vh']"),
         )
-        for section in body.sections:
-            filtered_sentences: List[str] = []
-            for sentence in section.paragraphs:
-                filtered_sentences.append(sentence.replace("(öffnet im neuen Fenster)", "").strip())
-            section.paragraphs = TextSequence(filtered_sentences)
-        return body

     @attribute
     def authors(self) -> List[str]:
```
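
This swaps the old post-processing loop, which stripped the literal string "(öffnet im neuen Fenster)" ("opens in a new window") from every paragraph after extraction, for filtering at the source: elements carrying Golem's `go-vh` class (apparently visually hidden link annotations) never enter the extracted text at all. A quick illustrative check that the committed XPath matches such an element (the `<span>` sample is assumed markup, not taken from golem.de):

```python
# Illustrative check only; the <span> sample below is assumed markup.
import lxml.html
from lxml.etree import XPath

tag_filter = XPath("self::*[@class='go-vh']")
span = lxml.html.fromstring('<span class="go-vh">(öffnet im neuen Fenster)</span>')

# XPath objects are callable; a non-empty result list means the element
# matches and would be dropped by Node.text_content.
assert tag_filter(span)
```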
