@@ -86,7 +86,8 @@ class Document:
8686 """Class to build an etree document out of html."""
8787
8888 def __init__ (self , input , positive_keywords = None , negative_keywords = None ,
89- url = None , min_text_length = 25 , retry_length = 250 , xpath = False ):
89+ url = None , min_text_length = 25 , retry_length = 250 , xpath = False ,
90+ handle_failures = 'discard' ):
9091 """Generate the document
9192
9293 :param input: string of the html content.
@@ -97,6 +98,8 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
9798 :param xpath: If set to True, adds x="..." attribute to each HTML node,
9899 containing xpath path pointing to original document path (allows to
99100 reconstruct selected summary in original document).
101+ :param handle_failures: Passed to `lxml` when rewriting links; controls how links that fail to resolve are handled.
102+ Supported options: "discard", "ignore", or None.
100103
101104 Examples:
102105 positive_keywords=["news-item", "block"]
@@ -122,6 +125,7 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
122125 self .min_text_length = min_text_length
123126 self .retry_length = retry_length
124127 self .xpath = xpath
128+ self .handle_failures = handle_failures
125129
126130 def _html (self , force = False ):
127131 if force or self .html is None :
@@ -141,13 +145,13 @@ def _parse(self, input):
141145 # trying to guard against bad links like <a href="http://[http://...">
142146 try :
143147 # such support is added in lxml 3.3.0
144- doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = 'discard' )
148+ doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = self . handle_failures )
145149 except TypeError : #make_links_absolute() got an unexpected keyword argument 'handle_failures'
146150 # then we have lxml < 3.3.0
147151 # please upgrade to lxml >= 3.3.0 if you're failing here!
148- doc .make_links_absolute (base_href , resolve_base_href = True )
152+ doc .make_links_absolute (base_href , resolve_base_href = True , handle_failures = self . handle_failures )
149153 else :
150- doc .resolve_base_href ()
154+ doc .resolve_base_href (handle_failures = self . handle_failures )
151155 return doc
152156
153157 def content (self ):
0 commit comments