Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/19301.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Switch to beautifulsoup4 from lxml for URL previews. Contributed by @clokep.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't evaluated whether beautifulsoup4 is a good dependency choice. The source is on launchpad.net which means it sucks to browse casually and the UI sucks, https://code.launchpad.net/beautifulsoup

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

beautifulsoup is the go-to package for parsing HTML in Python and has been for at least a decade.

4 changes: 0 additions & 4 deletions docs/setup/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users
spidering 'internal' URLs on your network. At the very least we recommend that
your loopback and RFC1918 IP addresses are blacklisted.

This also requires the optional `lxml` python dependency to be installed. This
in turn requires the `libxml2` library to be available - on Debian/Ubuntu this
means `apt-get install libxml2-dev`, or equivalent for your OS.

### Backups

Don't forget to take [backups](../usage/administration/backups.md) of your new server!
Expand Down
215 changes: 40 additions & 175 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ oidc = ["authlib>=0.15.1"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python>=231"]
url-preview = ["lxml>=4.6.3"]
url-preview = ["beautifulsoup4>=4.13.0"]
sentry = ["sentry-sdk>=0.7.2"]
opentracing = [
"jaeger-client>=4.2.0",
Expand Down Expand Up @@ -177,7 +177,7 @@ all = [
# oidc and jwt
"authlib>=0.15.1",
# url-preview
"lxml>=4.6.3",
"beautifulsoup4>=4.13.0",
# sentry
"sentry-sdk>=0.7.2",
# opentracing
Expand Down Expand Up @@ -261,7 +261,6 @@ generate-setup-file = true
ruff = "0.14.6"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
Expand Down Expand Up @@ -436,6 +435,7 @@ sdist-include = [
"rust/build.rs",
"rust/src/**",
]

sdist-exclude = ["synapse/*.so"]

[build-system]
Expand Down
94 changes: 37 additions & 57 deletions synapse/media/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@

import attr

from synapse.media.preview_html import parse_html_description
from synapse.media.preview_html import NON_BLANK, decode_body, parse_html_description
from synapse.types import JsonDict
from synapse.util.json import json_decoder

if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup

from synapse.server import HomeServer

Expand Down Expand Up @@ -105,35 +105,25 @@ def get_oembed_url(self, url: str) -> str | None:
# No match.
return None

def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None:
    """
    Search an HTML document for oEmbed autodiscovery information.

    Looks for a `<link>` element advertising an oEmbed endpoint, i.e. one
    with rel="alternate" (or "alternative", used by some providers such as
    Flickr) and type="application/json+oembed". If several matching tags
    exist, the first one in document order wins.

    Args:
        soup: The parsed HTML body.

    Returns:
        The URL to use for oEmbed information, or None if no URL was found.
    """
    # Search for link elements with the proper rel and type attributes and
    # a non-blank href (NON_BLANK filters out missing/empty attributes, so
    # indexing `tag["href"]` below cannot raise KeyError).
    tag = soup.find(
        "link",
        rel=("alternate", "alternative"),
        type="application/json+oembed",
        href=NON_BLANK,
    )
    # Compare against None explicitly rather than relying on truthiness:
    # NOTE(review): bs4 Tag truthiness appears to be based on the tag's
    # contents, and <link> is a void element (always childless), so a found
    # tag could evaluate falsy — confirm against the bs4 version in use.
    if tag is None:
        return None
    return cast(str, tag["href"])
Copy link
Contributor

@MadLittleMods MadLittleMods Dec 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Safer lookups to avoid relying on assumptions from prior code:

Suggested change
return cast(str, tag["href"]) if tag else None
tag = soup.find(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
)
href = tag.get("href")
return cast(str, href) if href else None

Prior art but ideally, we'd do even better:

Suggested change
return cast(str, tag["href"]) if tag else None
tags = soup.find_all(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
)
if len(tags) == 0:
return None
elif len(tags) == 1:
tag = tags[0]
href = tag.get("href")
return cast(str, href) if href else None
else:
# If there are multiple tags, return an error. We don't want to even
# try to pick the right one if there are multiple as we could run into
# problems similar to request smuggling vulnerabilities which rely on the
# mismatch of how different systems interpret information.
raise ValueError(
'Expected one `<link type="application/json+oembed">` but found multiple.'
)

Also applies elsewhere below

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how this is safer or what assumptions the original code makes, can you clarify?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous code uses tag["href"] which we assume works because of href=NON_BLANK (a tenuous, separate constraint). Without that, href might not exist and would result in a KeyError or None, and then we're casting to str.

This is a Parse, don't validate type of situation. Ideally, the returned type could guarantee us what we just validated. But instead we get back a generic type with all of that information lost.

(there are other NON_BLANK usages like this as well)

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Is it normal to have multiple tags on the page? If so, searching for the first one may be fine. We should comment about our reasoning.

If a page should have one, we should be more strict.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous code uses tag["href"] which we assume works because of href=NON_BLANK (a tenuous, separate constraint). Without that, href might not exist and would result in a KeyError or None, and then we're casting to str.

This is a Parse, don't validate type of situation. Ideally, the returned type could guarantee us what we just validated. But instead we get back a generic type with all of that information lost.

(there are other NON_BLANK usages like this as well)

I'm honestly not following this at all, sorry. What part do you dislike? Is it the cast to string? Or that we're assuming href is a property (which we already checked above with the filter)?

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Is it normal to have multiple tags on the page? If so, searching for the first one may be fine. We should comment about our reasoning.

It is perfectly acceptable to have more than one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the best that we can do?

We can use safer lookups (tag.get("href")) and check things as my suggestion does 🤷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

I understand what you're saying with unwrap, but it isn't helpful to discuss Rust semantics when writing Python code.

And even so, I don't think you'd want something as specific as a TagWithHref class, HTML is pretty generic so that doesn't make sense.

Copy link
Contributor Author

@clokep clokep Feb 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the best that we can do?

We can use safer lookups (tag.get("href")) and check things as my suggestion does 🤷

But we already know that it exists? This would be akin to writing:

my_dict: dict[str, Foo]
if "my_key" in my_dict:
  my_value = my_dict.get("my_key")

It makes the value ambiguous when it isn't.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

The get_attribute(...) helper looks like it's addressing something else (single vs multi-valued attributes).


But we already know that it exists?

In this case, the functions are small so the cognitive load to keep things straight (even into the future) isn't that bad (smoke).

In Python, we see this kind of thing (making reasonable assumptions based on prior validation) because it easily passes by (regardless of how well those assumptions are formulated). My preference is on being defensive and asserting our assumptions especially to better protect things as they evolve into the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

The get_attribute(...) helper looks like it's addressing something else (single vs multi-valued attributes).

It is addressing why there were so many cast(...) calls which I thought is what you found confusing, but I guess I'm still not understanding.


def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Expand Down Expand Up @@ -196,7 +186,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
if oembed_type == "rich":
html_str = oembed.get("html")
if isinstance(html_str, str):
calc_description_and_urls(open_graph_response, html_str)
calc_description_and_urls(open_graph_response, html_str, url)

elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
Expand All @@ -208,7 +198,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
open_graph_response["og:type"] = "video.other"
html_str = oembed.get("html")
if html_str and isinstance(html_str, str):
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
for size in ("width", "height"):
val = oembed.get(size)
if type(val) is int: # noqa: E721
Expand All @@ -223,55 +213,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None:
    """
    Find the first tag of the given name with a non-blank `src` attribute.

    Args:
        soup: The parsed HTML body.
        tag_name: The tag to search for, e.g. "img" or "video".

    Returns:
        The value of the `src` attribute of the first matching tag, or
        None if no such tag exists.
    """
    # NON_BLANK ensures the src attribute exists and is non-empty, so the
    # `tag["src"]` lookup below cannot raise KeyError.
    tag = soup.find(tag_name, src=NON_BLANK)
    # Compare against None explicitly rather than relying on truthiness:
    # NOTE(review): bs4 Tag truthiness appears to be based on the tag's
    # contents, and tags like <img> / <embed> are void elements (childless),
    # so a found tag could evaluate falsy — confirm against the bs4 version
    # in use.
    if tag is None:
        return None
    return cast(str, tag["src"])


def calc_description_and_urls(
    open_graph_response: JsonDict, html_body: str, url: str
) -> None:
    """
    Calculate description for an HTML document.

    This uses BeautifulSoup to convert the HTML document into plaintext. If errors
    occur during processing of the document, an empty response is returned.

    Args:
        open_graph_response: The current Open Graph summary. This is updated
            with additional fields (og:image, og:video, og:description).
        html_body: The HTML document, as a string.
        url: The URL which is being previewed (not the one which was requested).
    """
    soup = decode_body(html_body, url)

    # If there's no body, nothing useful is going to be found.
    if not soup:
        return

    # Attempt to find interesting URLs (images, videos, embeds).
    if "og:image" not in open_graph_response:
        image_url = _fetch_url(soup, "img")
        if image_url:
            open_graph_response["og:image"] = image_url

    # Prefer a <video> tag; fall back to an <embed> tag only if no video
    # was found.
    video_url = _fetch_url(soup, "video")
    if video_url:
        open_graph_response["og:video"] = video_url
    else:
        embed_url = _fetch_url(soup, "embed")
        if embed_url:
            open_graph_response["og:video"] = embed_url

    description = parse_html_description(soup)
    if description:
        open_graph_response["og:description"] = description
Loading
Loading