@@ -133,7 +133,7 @@ def decode_body(
133133 content_type: The Content-Type header.
134134
135135 Returns:
136- The parsed HTML body, or None if an error occurred during processed .
136+ The parsed HTML body, or None if an error occurred during processing .
137137 """
138138 # If there's no body, nothing useful is going to be found.
139139 if not body :
@@ -158,9 +158,31 @@ def decode_body(
158158 # Create an HTML parser.
159159 parser = etree .HTMLParser (recover = True , encoding = encoding )
160160
161- # Attempt to parse the body. Returns None if the body was successfully
162- # parsed, but no tree was found.
163- return etree .fromstring (body , parser )
161+ # Attempt to parse the body. With `lxml` 6.0.0+, this will be an empty HTML
162+ # tree if the body was successfully parsed, but no tree was found. In
163+ # previous `lxml` versions, `etree.fromstring` would return `None` in that
164+ # case.
165+ html_tree = etree .fromstring (body , parser )
166+
167+ # Account for the above referenced case where `html_tree` is an HTML tree
168+ # with an empty body. If so, return None.
169+ if html_tree is not None and html_tree .tag == "html" :
170+ # If the tree has only a single <body> element and it's empty, then
171+ # return None.
172+ body_el = html_tree .find ("body" )
173+ if body_el is not None and len (html_tree ) == 1 :
174+ # Extract the content of the body tag as text.
175+ body_text = "" .join (cast (Iterable [str ], body_el .itertext ()))
176+
177+ # Strip any undecodable Unicode characters and whitespace.
178+ body_text = body_text .strip ("\ufffd " ).strip ()
179+
180+ # If there's no text left, and there were no child tags,
181+ # then we consider the <body> tag empty.
182+ if not body_text and len (body_el ) == 0 :
183+ return None
184+
185+ return html_tree
164186
165187
166188def _get_meta_tags (
0 commit comments