Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/19301.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Switch to beautifulsoup4 from lxml for URL previews. Contributed by @clokep.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't evaluated whether beautifulsoup4 is a good dependency choice. The source is on launchpad.net which means it sucks to browse casually and the UI sucks, https://code.launchpad.net/beautifulsoup

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

beautifulsoup is the go-to package for parsing HTML in Python and has been for at least a decade.

4 changes: 0 additions & 4 deletions docs/setup/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users
spidering 'internal' URLs on your network. At the very least we recommend that
your loopback and RFC1918 IP addresses are blacklisted.

This also requires the optional `lxml` python dependency to be installed. This
in turn requires the `libxml2` library to be available - on Debian/Ubuntu this
means `apt-get install libxml2-dev`, or equivalent for your OS.

### Backups

Don't forget to take [backups](../usage/administration/backups.md) of your new server!
Expand Down
215 changes: 40 additions & 175 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ oidc = ["authlib>=0.15.1"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python>=231"]
url-preview = ["lxml>=4.6.3"]
url-preview = ["beautifulsoup4>=4.13.0"]
sentry = ["sentry-sdk>=0.7.2"]
opentracing = [
"jaeger-client>=4.2.0",
Expand Down Expand Up @@ -177,7 +177,7 @@ all = [
# oidc and jwt
"authlib>=0.15.1",
# url-preview
"lxml>=4.6.3",
"beautifulsoup4>=4.13.0",
# sentry
"sentry-sdk>=0.7.2",
# opentracing
Expand Down Expand Up @@ -261,7 +261,6 @@ generate-setup-file = true
ruff = "0.14.6"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
Expand Down Expand Up @@ -436,6 +435,7 @@ sdist-include = [
"rust/build.rs",
"rust/src/**",
]

sdist-exclude = ["synapse/*.so"]

[build-system]
Expand Down
94 changes: 37 additions & 57 deletions synapse/media/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@

import attr

from synapse.media.preview_html import parse_html_description
from synapse.media.preview_html import NON_BLANK, decode_body, parse_html_description
from synapse.types import JsonDict
from synapse.util.json import json_decoder

if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup

from synapse.server import HomeServer

Expand Down Expand Up @@ -105,35 +105,25 @@ def get_oembed_url(self, url: str) -> str | None:
# No match.
return None

def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None:
    """
    Search an HTML document for oEmbed autodiscovery information.

    Looks for a `<link>` element advertising an oEmbed endpoint, i.e. one
    with rel="alternate" (or "alternative", used by some providers such as
    Flickr) and type="application/json+oembed". If several matching tags
    exist, the first one in document order wins.

    Args:
        soup: The parsed HTML body.

    Returns:
        The URL to use for oEmbed information, or None if no URL was found.
    """
    # Search for link elements with the proper rel and type attributes and
    # a non-blank href (NON_BLANK filters out missing/empty attributes, so
    # indexing `tag["href"]` below cannot raise KeyError).
    tag = soup.find(
        "link",
        rel=("alternate", "alternative"),
        type="application/json+oembed",
        href=NON_BLANK,
    )
    # Compare against None explicitly rather than relying on truthiness:
    # NOTE(review): bs4 Tag truthiness appears to be based on the tag's
    # contents, and <link> is a void element (always childless), so a found
    # tag could evaluate falsy — confirm against the bs4 version in use.
    if tag is None:
        return None
    return cast(str, tag["href"])
Copy link
Contributor

@MadLittleMods MadLittleMods Dec 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Safer lookups to avoid relying on assumptions from prior code:

Suggested change
return cast(str, tag["href"]) if tag else None
tag = soup.find(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
)
href = tag.get("href")
return cast(str, href) if href else None

Prior art but ideally, we'd do even better:

Suggested change
return cast(str, tag["href"]) if tag else None
tags = soup.find_all(
"link",
rel=("alternate", "alternative"),
type="application/json+oembed",
)
if len(tags) == 0:
return None
elif len(tags) == 1:
tag = tags[0]
href = tag.get("href")
return cast(str, href) if href else None
else:
# If there are multiple tags, return an error. We don't want to even
# try to pick the right one if there are multiple as we could run into
# problems similar to request smuggling vulnerabilities which rely on the
# mismatch of how different systems interpret information.
raise ValueError(
'Expected one `<link type="application/json+oembed">` but found multiple.'
)

Also applies elsewhere below

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how this is safer or what assumptions the original code makes, can you clarify?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous code uses tag["href"] which we assume works because of href=NON_BLANK (a tenuous, separate constraint). Without that, href might not exist and would result in a KeyError or None, and then we're casting to str.

This is a Parse, don't validate type of situation. Ideally, the returned type could guarantee us what we just validated. But instead we get back a generic type with all of that information lost.

(there are other NON_BLANK usages like this as well)

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Is it normal to have multiple tags on the page? If so, searching for the first one may be fine. We should comment about our reasoning.

If a page should have one, we should be more strict.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous code uses tag["href"] which we assume works because of href=NON_BLANK (a tenuous, separate constraint). Without that, href might not exist and would result in a KeyError or None, and then we're casting to str.

This is a Parse, don't validate type of situation. Ideally, the returned type could guarantee us what we just validated. But instead we get back a generic type with all of that information lost.

(there are other NON_BLANK usages like this as well)

I'm honestly not following this at all, sorry. What part do you dislike? Is it the cast to string? Or that we're assuming href is a property (which we already checked above with the filter)?

I don't think this is correct -- we choose the first one, not give up if there are more than one.

Is it normal to have multiple tags on the page? If so, searching for the first one may be fine. We should comment about our reasoning.

It is perfectly acceptable to have more than one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the best that we can do?

We can use safer lookups (tag.get("href")) and check things as my suggestion does 🤷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

I understand what you're saying with unwrap, but it isn't helpful to discuss Rust semantics when writing Python code.

And even so, I don't think you'd want something as specific as a TagWithHref class, HTML is pretty generic so that doesn't make sense.

Copy link
Contributor Author

@clokep clokep Feb 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the best that we can do?

We can use safer lookups (tag.get("href")) and check things as my suggestion does 🤷

But we already know that it exists? This would be akin to writing:

my_dict: dict[str, Foo]
if "my_key" in my_dict:
  my_value = my_dict.get("my_key")

It makes the value ambiguous when it isn't.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

The get_attribute(...) helper looks like it's addressing something else (single vs multi-valued attributes).


But we already know that it exists?

In this case, the functions are small so the cognitive load to keep things straight (even into the future) isn't that bad (smoke).

In Python, we see this kind of thing (making reasonable assumptions based on prior validation) because it easily passes by (regardless of how well those assumptions are formulated). My preference is on being defensive and asserting our assumptions especially to better protect things as they evolve into the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a helper method that will hopefully help?

The get_attribute(...) helper looks like it's addressing something else (single vs multi-valued attributes).

It is addressing why there were so many cast(...) calls which I thought is what you found confusing, but I guess I'm still not understanding.


def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Expand Down Expand Up @@ -196,7 +186,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
if oembed_type == "rich":
html_str = oembed.get("html")
if isinstance(html_str, str):
calc_description_and_urls(open_graph_response, html_str)
calc_description_and_urls(open_graph_response, html_str, url)

elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
Expand All @@ -208,7 +198,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
open_graph_response["og:type"] = "video.other"
html_str = oembed.get("html")
if html_str and isinstance(html_str, str):
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
for size in ("width", "height"):
val = oembed.get(size)
if type(val) is int: # noqa: E721
Expand All @@ -223,55 +213,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None:
    """
    Find the first tag of the given name with a non-blank `src` attribute.

    Args:
        soup: The parsed HTML body.
        tag_name: The tag to search for, e.g. "img" or "video".

    Returns:
        The value of the `src` attribute of the first matching tag, or
        None if no such tag exists.
    """
    # NON_BLANK ensures the src attribute exists and is non-empty, so the
    # `tag["src"]` lookup below cannot raise KeyError.
    tag = soup.find(tag_name, src=NON_BLANK)
    # Compare against None explicitly rather than relying on truthiness:
    # NOTE(review): bs4 Tag truthiness appears to be based on the tag's
    # contents, and tags like <img> / <embed> are void elements (childless),
    # so a found tag could evaluate falsy — confirm against the bs4 version
    # in use.
    if tag is None:
        return None
    return cast(str, tag["src"])


def calc_description_and_urls(
    open_graph_response: JsonDict, html_body: str, url: str
) -> None:
    """
    Calculate description for an HTML document.

    This uses BeautifulSoup to convert the HTML document into plaintext. If errors
    occur during processing of the document, an empty response is returned.

    Args:
        open_graph_response: The current Open Graph summary. This is updated
            with additional fields (og:image, og:video, og:description).
        html_body: The HTML document, as a string.
        url: The URL which is being previewed (not the one which was requested).
    """
    soup = decode_body(html_body, url)

    # If there's no body, nothing useful is going to be found.
    if not soup:
        return

    # Attempt to find interesting URLs (images, videos, embeds).
    if "og:image" not in open_graph_response:
        image_url = _fetch_url(soup, "img")
        if image_url:
            open_graph_response["og:image"] = image_url

    # Prefer a <video> tag; fall back to an <embed> tag only if no video
    # was found.
    video_url = _fetch_url(soup, "video")
    if video_url:
        open_graph_response["og:video"] = video_url
    else:
        embed_url = _fetch_url(soup, "embed")
        if embed_url:
            open_graph_response["og:video"] = embed_url

    description = parse_html_description(soup)
    if description:
        open_graph_response["og:description"] = description
Loading
Loading