@@ -76,13 +76,17 @@ class Node:
     # one could replace this recursion with XPath using an expression like this:
     # //*[not(self::script) and text()]/text(), but for whatever reason, that's actually 50-150% slower
     # than simply using the implemented mixture below
-    def text_content(self, excluded_tags: Optional[List[str]] = None) -> str:
+    def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str:
         guarded_excluded_tags: List[str] = excluded_tags or []

         def _text_content(element: lxml.html.HtmlElement) -> str:
-            if element.tag in guarded_excluded_tags:
+            if (
+                element.tag in guarded_excluded_tags
+                or isinstance(element, lxml.html.HtmlComment)
+                or (tag_filter and tag_filter(element))
+            ):
                 return element.tail or ""
-            text = element.text or "" if not isinstance(element, lxml.html.HtmlComment) else ""
+            text = element.text or ""
             children = "".join([_text_content(child) for child in element.iterchildren()])
             tail = element.tail or ""
             return text + children + tail
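The changed `text_content` above now short-circuits whenever `tag_filter(element)` is truthy, so any callable taking an lxml element and returning a truthy/falsy value fits; an `lxml.etree.XPath` object works because evaluating it with an element as context yields a boolean or a (possibly empty) node-set. A minimal sketch of that call pattern, with a made-up `data-ad` attribute as the filter criterion (not part of this diff):

# Sketch only: the selector and markup below are illustrative assumptions.
import lxml.html
from lxml.etree import XPath

# hypothetical filter: elements carrying a data-ad attribute are treated like excluded tags
tag_filter = XPath("boolean(@data-ad)")

element = lxml.html.fromstring('<p data-ad="1">sponsored text</p>')
print(tag_filter(element))  # True -> the node would contribute only its tail text
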
@@ -133,6 +137,7 @@ def extract_article_body_with_selector(
     paragraph_selector: XPath,
     summary_selector: Optional[XPath] = None,
     subheadline_selector: Optional[XPath] = None,
+    tag_filter: Optional[XPath] = None,
 ) -> ArticleBody:
     # depth first index for each element in tree
     df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())}
@@ -164,14 +169,22 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
     instructions = itertools.chain([first, []], instructions)

     summary = TextSequence(
-        map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), next(instructions))
+        map(
+            lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
+            next(instructions),
+        )
     )
     sections: List[ArticleSection] = []

     for chunk in more_itertools.chunked(instructions, 2):
         if len(chunk) == 1:
             chunk.append([])
-        texts = [list(map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"])), c)) for c in chunk]
+        texts = [
+            list(
+                map(lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)), c)
+            )
+            for c in chunk
+        ]
         sections.append(ArticleSection(*map(TextSequence, texts)))

     return ArticleBody(summary=summary, sections=sections)
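With the new parameter threaded through, a caller can drop unwanted inline elements (for example footnote markers) without listing their tag names in `excluded_tags`. A hedged usage sketch, assuming the function's first positional argument is the parsed `doc` (as suggested by `doc.iter()` above) and that `XPath` and `extract_article_body_with_selector` are imported as in this module; the footnote markup is invented for illustration:

import lxml.html
from lxml.etree import XPath

doc = lxml.html.fromstring(
    "<article>"
    "<p>First paragraph.<span class='footnote'>[1]</span></p>"
    "<p>Second paragraph.</p>"
    "</article>"
)

body = extract_article_body_with_selector(
    doc,
    paragraph_selector=XPath("//p"),
    # truthy for footnote spans, so their text is skipped while their tail is kept
    tag_filter=XPath("self::span[@class='footnote']"),
)
print(body)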