Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit e2481db

Browse files
authored
Allow configuration of the oEmbed URLs. (#10714)
This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews.
1 parent 287918e commit e2481db

File tree

8 files changed

+463
-252
lines changed

8 files changed

+463
-252
lines changed

changelog.d/10714.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Allow configuration of the oEmbed URLs used for URL previews.

docs/sample_config.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,27 @@ url_preview_accept_language:
10751075
# - en
10761076

10771077

1078+
# oEmbed allows for easier embedding content from a website. It can be
1079+
# used for generating URLs previews of services which support it.
1080+
#
1081+
oembed:
1082+
# A default list of oEmbed providers is included with Synapse.
1083+
#
1084+
# Uncomment the following to disable using these default oEmbed URLs.
1085+
# Defaults to 'false'.
1086+
#
1087+
#disable_default_providers: true
1088+
1089+
# Additional files with oEmbed configuration (each should be in the
1090+
# form of providers.json).
1091+
#
1092+
# By default, this list is empty (so only the default providers.json
1093+
# is used).
1094+
#
1095+
#additional_providers:
1096+
# - oembed/my_providers.json
1097+
1098+
10781099
## Captcha ##
10791100
# See docs/CAPTCHA_SETUP.md for full details of configuring this.
10801101

synapse/config/homeserver.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from .logger import LoggingConfig
3131
from .metrics import MetricsConfig
3232
from .modules import ModulesConfig
33+
from .oembed import OembedConfig
3334
from .oidc import OIDCConfig
3435
from .password_auth_providers import PasswordAuthProviderConfig
3536
from .push import PushConfig
@@ -65,6 +66,7 @@ class HomeServerConfig(RootConfig):
6566
LoggingConfig,
6667
RatelimitConfig,
6768
ContentRepositoryConfig,
69+
OembedConfig,
6870
CaptchaConfig,
6971
VoipConfig,
7072
RegistrationConfig,

synapse/config/oembed.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# Copyright 2021 The Matrix.org Foundation C.I.C.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import json
15+
import re
16+
from typing import Any, Dict, Iterable, List, Pattern
17+
from urllib import parse as urlparse
18+
19+
import attr
20+
import pkg_resources
21+
22+
from synapse.types import JsonDict
23+
24+
from ._base import Config, ConfigError
25+
from ._util import validate_config
26+
27+
28+
@attr.s(auto_attribs=True, frozen=True, slots=True)
class OEmbedEndpointConfig:
    """An oEmbed API endpoint together with the URL patterns routed to it."""

    # URL of the oEmbed API endpoint to request.
    api_endpoint: str
    # Compiled patterns; a URL matching any of them uses this endpoint.
    url_patterns: List[Pattern]
34+
35+
36+
class OembedConfig(Config):
    """oEmbed Configuration"""

    section = "oembed"

    def read_config(self, config, **kwargs):
        """Read the `oembed` section and store the parsed endpoint patterns.

        Args:
            config: The full homeserver configuration dictionary. A missing
                or empty `oembed` section is treated as an empty dict.
        """
        oembed_config: Dict[str, Any] = config.get("oembed") or {}

        # A list of patterns which will be used.
        self.oembed_patterns: List[OEmbedEndpointConfig] = list(
            self._parse_and_validate_providers(oembed_config)
        )

    def _parse_and_validate_providers(
        self, oembed_config: dict
    ) -> Iterable[OEmbedEndpointConfig]:
        """Extract and parse the oEmbed providers from the configured JSON files.

        The packaged providers.json is read first (unless disabled), followed
        by each file listed under `additional_providers`, in order.

        Args:
            oembed_config: The contents of the `oembed` configuration section.

        Returns:
            A generator which yields the OEmbedEndpointConfig objects.
        """
        # Whether to use the packaged providers.json file.
        if not oembed_config.get("disable_default_providers"):
            stream = pkg_resources.resource_stream("synapse", "res/providers.json")
            try:
                providers = json.load(stream)
            finally:
                # Do not leak the handle to the packaged resource.
                stream.close()

            yield from self._parse_and_validate_provider(
                providers, config_path=("oembed",)
            )

        # The JSON files which includes additional provider information.
        for i, path in enumerate(oembed_config.get("additional_providers") or []):
            # TODO Error checking.
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as f:
                providers = json.load(f)

            yield from self._parse_and_validate_provider(
                providers,
                config_path=(
                    "oembed",
                    "additional_providers",
                    f"<item {i}>",
                ),
            )

    def _parse_and_validate_provider(
        self, providers: List[JsonDict], config_path: Iterable[str]
    ) -> Iterable[OEmbedEndpointConfig]:
        """Validate one parsed providers file and yield its endpoints.

        Args:
            providers: The parsed JSON content of a providers file.
            config_path: Where this data came from in the configuration; passed
                to the schema validator and used when raising ConfigError.

        Returns:
            A generator which yields one OEmbedEndpointConfig per endpoint.
        """
        # Ensure it is the proper form.
        validate_config(
            _OEMBED_PROVIDER_SCHEMA,
            providers,
            config_path=config_path,
        )

        # Parse it and yield each result.
        for provider in providers:
            # Each provider might have multiple API endpoints, each which
            # might have multiple patterns to match.
            for endpoint in provider["endpoints"]:
                api_endpoint = endpoint["url"]
                patterns = [
                    self._glob_to_pattern(glob, config_path)
                    for glob in endpoint["schemes"]
                ]
                yield OEmbedEndpointConfig(api_endpoint, patterns)

    def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern:
        """
        Convert the glob into a sane regular expression to match against. The
        rules followed will be slightly different for the domain portion vs.
        the rest.

        1. The scheme must be one of HTTP / HTTPS (and have no globs).
        2. The domain can have globs, but we limit it to characters that can
           reasonably be a domain part.
           TODO: This does not attempt to handle Unicode domain names.
           TODO: The domain should not allow wildcard TLDs.
        3. Other parts allow a glob to be any one, or more, characters.

        Args:
            glob: The URL glob (a "scheme" entry of a providers file).
            config_path: Location in the configuration, used for error reporting.

        Returns:
            The compiled regular expression.

        Raises:
            ConfigError: If the glob's scheme is not http or https.
        """
        results = urlparse.urlparse(glob)

        # Ensure the scheme does not have wildcards (and is a sane scheme).
        if results.scheme not in {"http", "https"}:
            raise ConfigError(f"Insecure oEmbed scheme: {results.scheme}", config_path)

        netloc = re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+")
        path, params, query, fragment = (
            re.escape(part).replace("\\*", ".+") for part in results[2:]
        )

        # Assemble the expression by hand instead of via urlunparse(): that
        # function re-inserts a raw "?" before the query, which the regex
        # engine would read as a quantifier rather than a literal character.
        pattern = f"{results.scheme}://{netloc}{path}"
        if params:
            pattern += ";" + params
        if query:
            pattern += "\\?" + query
        if fragment:
            pattern += "#" + fragment
        return re.compile(pattern)

    def generate_config_section(self, **kwargs):
        """Return the commented sample YAML for the `oembed` section."""
        # The YAML is kept flush-left so the emitted text does not depend on
        # any dedenting applied by the caller.
        return """\
# oEmbed allows for easier embedding content from a website. It can be
# used for generating URLs previews of services which support it.
#
oembed:
  # A default list of oEmbed providers is included with Synapse.
  #
  # Uncomment the following to disable using these default oEmbed URLs.
  # Defaults to 'false'.
  #
  #disable_default_providers: true

  # Additional files with oEmbed configuration (each should be in the
  # form of providers.json).
  #
  # By default, this list is empty (so only the default providers.json
  # is used).
  #
  #additional_providers:
  # - oembed/my_providers.json
"""
152+
153+
154+
# JSON schema for one entry of a provider's "endpoints" list.
_OEMBED_ENDPOINT_SCHEMA = {
    "type": "object",
    "properties": {
        "schemes": {"type": "array", "items": {"type": "string"}},
        "url": {"type": "string"},
        "formats": {"type": "array", "items": {"type": "string"}},
        "discovery": {"type": "boolean"},
    },
    "required": ["schemes", "url"],
}

# JSON schema for a whole providers file: a list of provider objects, each
# carrying a name, a URL and a list of endpoints.
_OEMBED_PROVIDER_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "provider_name": {"type": "string"},
            "provider_url": {"type": "string"},
            "endpoints": {"type": "array", "items": _OEMBED_ENDPOINT_SCHEMA},
        },
        "required": ["provider_name", "provider_url", "endpoints"],
    },
}

synapse/res/providers.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[
2+
{
3+
"provider_name": "Twitter",
4+
"provider_url": "http://www.twitter.com/",
5+
"endpoints": [
6+
{
7+
"schemes": [
8+
"https://twitter.com/*/status/*",
9+
"https://*.twitter.com/*/status/*",
10+
"https://twitter.com/*/moments/*",
11+
"https://*.twitter.com/*/moments/*"
12+
],
13+
"url": "https://publish.twitter.com/oembed"
14+
}
15+
]
16+
}
17+
]

synapse/rest/media/v1/oembed.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright 2021 The Matrix.org Foundation C.I.C.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import logging
15+
from typing import TYPE_CHECKING, Optional
16+
17+
import attr
18+
19+
from synapse.http.client import SimpleHttpClient
20+
21+
if TYPE_CHECKING:
22+
from synapse.server import HomeServer
23+
24+
logger = logging.getLogger(__name__)
25+
26+
27+
@attr.s(slots=True, auto_attribs=True)
class OEmbedResult:
    """The fields of an oEmbed response kept for building a preview."""

    # Either HTML content or URL must be provided.
    html: Optional[str]
    url: Optional[str]
    title: Optional[str]
    # Number of seconds to cache the content, or None when the response did
    # not include a cache age.
    cache_age: Optional[int]
35+
36+
37+
class OEmbedError(Exception):
    """Raised when oEmbed information cannot be fetched or parsed."""
39+
40+
41+
class OEmbedProvider:
    """
    Decides whether a URL should be previewed through oEmbed and, if so,
    requests and parses the oEmbed response for it.
    """

    def __init__(self, hs: "HomeServer", client: SimpleHttpClient):
        # Flatten the configured endpoints into a map of compiled URL pattern
        # to the API endpoint which serves it.
        self._oembed_patterns = {
            compiled: endpoint_config.api_endpoint
            for endpoint_config in hs.config.oembed.oembed_patterns
            for compiled in endpoint_config.url_patterns
        }
        self._client = client

    def get_oembed_url(self, url: str) -> Optional[str]:
        """
        Find the oEmbed API endpoint, if any, that handles the given URL.

        Args:
            url: The URL to check.

        Returns:
            The endpoint of the first pattern fully matching the URL, or None
            if no pattern matches and the original URL should be used.
        """
        matching_endpoints = (
            api_endpoint
            for compiled, api_endpoint in self._oembed_patterns.items()
            if compiled.fullmatch(url)
        )
        return next(matching_endpoints, None)

    async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
        """
        Fetch the oEmbed data for a URL and convert it into an OEmbedResult.

        Args:
            endpoint: The oEmbed API endpoint.
            url: The URL to pass to the API.

        Returns:
            An object representing the metadata returned.

        Raises:
            OEmbedError if fetching or parsing of the oEmbed information fails.
        """
        try:
            logger.debug("Trying to get oEmbed content for url '%s'", url)
            # TODO Specify max height / width.
            # Note that only the JSON format is supported.
            result = await self._client.get_json(endpoint, args={"url": url})

            # Only version 1.0 responses are accepted.
            version = result.get("version")
            if version != "1.0":
                raise OEmbedError("Invalid version: %s" % (version,))

            kind = result.get("type")

            # A truthy cache age is coerced to an int; anything else is kept.
            raw_cache_age = result.get("cache_age")
            cache_age = int(raw_cache_age) if raw_cache_age else raw_cache_age

            parsed = OEmbedResult(None, None, result.get("title"), cache_age)

            if kind == "rich":
                # HTML content.
                parsed.html = result.get("html")
            elif kind == "photo":
                parsed.url = result.get("url")
            elif "thumbnail_url" in result:
                # TODO Handle link and video types.
                parsed.url = result.get("thumbnail_url")
            else:
                raise OEmbedError("Incompatible oEmbed information.")

            return parsed

        except OEmbedError as e:
            # Our own errors are logged and passed straight through; this
            # clause must come before the generic handler below.
            logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
            raise

        except Exception as e:
            # Anything else is wrapped so callers only see OEmbedError.
            # FIXME: pass through 404s and other error messages nicely
            logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
            raise OEmbedError() from e

0 commit comments

Comments
 (0)