Update URL Preview code to work with lxml 6.0.0 (#18622)

anoadragon453 · web-flow · commit 5f027adb33bc · 2025-07-15T15:04:29.000+01:00
diff --git a/changelog.d/18622.misc b/changelog.d/18622.misc
@@ -0,0 +1 @@
+Update URL Preview code to work with `lxml` 6.0.0+.
diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py
@@ -133,7 +133,7 @@ def decode_body(
         content_type: The Content-Type header.
 
     Returns:
-        The parsed HTML body, or None if an error occurred during processed.
+        The parsed HTML body, or None if an error occurred during processing.
     """
     # If there's no body, nothing useful is going to be found.
     if not body:
@@ -158,9 +158,31 @@ def decode_body(
     # Create an HTML parser.
     parser = etree.HTMLParser(recover=True, encoding=encoding)
 
-    # Attempt to parse the body. Returns None if the body was successfully
-    # parsed, but no tree was found.
-    return etree.fromstring(body, parser)
+    # Attempt to parse the body. With `lxml` 6.0.0+, this will be an empty HTML
+    # tree if the body was successfully parsed, but no tree was found. In
+    # previous `lxml` versions, `etree.fromstring` would return `None` in that
+    # case.
+    html_tree = etree.fromstring(body, parser)
+
+    # Account for the above referenced case where `html_tree` is an HTML tree
+    # with an empty body. If so, return None.
+    if html_tree is not None and html_tree.tag == "html":
+        # If the tree has only a single <body> element and it's empty, then
+        # return None.
+        body_el = html_tree.find("body")
+        if body_el is not None and len(html_tree) == 1:
+            # Extract the content of the body tag as text.
+            body_text = "".join(cast(Iterable[str], body_el.itertext()))
+
+            # Strip any undecodable Unicode characters and whitespace.
+            body_text = body_text.strip("\ufffd").strip()
+
+            # If there's no text left, and there were no child tags,
+            # then we consider the <body> tag empty.
+            if not body_text and len(body_el) == 0:
+                return None
+
+    return html_tree
 
 
 def _get_meta_tags(

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Update URL Preview code to work with `lxml` 6.0.0+.