This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 1b11284

Autodiscover oEmbed endpoint from returned HTML (#10822)
Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (`<link rel=...>`) and, if one is found, requests it to generate the preview.
1 parent 593eeac commit 1b11284
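
For context, the autodiscovery mechanism relies on the page advertising its oEmbed endpoint in a `<link>` element. Below is a minimal, standalone sketch (the HTML and URL are made-up examples; the xpath mirrors the one added in this commit) of pulling such an endpoint out of a document with lxml:

    from lxml import etree

    # Hypothetical HTML returned by the previewed page; the <link> element is the
    # oEmbed autodiscovery hint this change looks for.
    html = b"""
    <html>
      <head>
        <link rel="alternate" type="application/json+oembed"
              href="https://publish.example.com/oembed?url=https%3A%2F%2Fexample.com%2Fpage&amp;format=json" />
      </head>
      <body>Hello</body>
    </html>
    """

    tree = etree.fromstring(html, etree.HTMLParser(recover=True))
    for tag in tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"):
        if "href" in tag.attrib:
            print(tag.attrib["href"])  # the oEmbed endpoint to request next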

5 files changed: +224 −55 lines

changelog.d/10822.feature

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Support autodiscovery of oEmbed previews.

synapse/rest/media/v1/oembed.py

Lines changed: 26 additions & 0 deletions
@@ -96,6 +96,32 @@ def get_oembed_url(self, url: str) -> Optional[str]:
         # No match.
         return None
 
+    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+        """
+        Search an HTML document for oEmbed autodiscovery information.
+
+        Args:
+            tree: The parsed HTML body.
+
+        Returns:
+            The URL to use for oEmbed information, or None if no URL was found.
+        """
+        # Search for link elements with the proper rel and type attributes.
+        for tag in tree.xpath(
+            "//link[@rel='alternate'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.xpath(
+            "//link[@rel='alternative'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        return None
+
     def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
         Parse the oEmbed response into an Open Graph response.

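For reference, the endpoint discovered above is expected to return a standard oEmbed JSON document, which `parse_oembed_response` then maps to an Open Graph response. A minimal, hypothetical photo-type payload (field names follow the oEmbed spec; the values are illustrative):

    import json

    # Illustrative oEmbed response body served by the discovered endpoint.
    oembed_response = {
        "version": "1.0",
        "type": "photo",                            # a photo response carries a direct image URL
        "url": "http://cdn.example.com/image.png",  # illustrative image URL
        "cache_age": 3600,                          # optional: suggested cache lifetime, in seconds
    }
    raw_body = json.dumps(oembed_response).encode("utf-8")
    # raw_body is the kind of payload parse_oembed_response(url, raw_body) receives.
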
synapse/rest/media/v1/preview_url_resource.py

Lines changed: 74 additions & 38 deletions
@@ -22,7 +22,7 @@
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
 from urllib import parse as urlparse
 
 import attr
@@ -296,22 +296,32 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
                 body = file.read()
 
             encoding = get_html_media_encoding(body, media_info.media_type)
-            og = decode_and_calc_og(body, media_info.uri, encoding)
-
-            await self._precache_image_url(user, media_info, og)
-
-        elif oembed_url and _is_json(media_info.media_type):
-            # Handle an oEmbed response.
-            with open(media_info.filename, "rb") as file:
-                body = file.read()
-
-            oembed_response = self._oembed.parse_oembed_response(url, body)
-            og = oembed_response.open_graph_result
-
-            # Use the cache age from the oEmbed result, instead of the HTTP response.
-            if oembed_response.cache_age is not None:
-                expiration_ms = oembed_response.cache_age
+            tree = decode_body(body, encoding)
+            if tree is not None:
+                # Check if this HTML document points to oEmbed information and
+                # defer to that.
+                oembed_url = self._oembed.autodiscover_from_html(tree)
+                og = {}
+                if oembed_url:
+                    oembed_info = await self._download_url(oembed_url, user)
+                    og, expiration_ms = await self._handle_oembed_response(
+                        url, oembed_info, expiration_ms
+                    )
+
+                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+                # to generate the Open Graph information from the HTML.
+                if not oembed_url or not og:
+                    og = _calc_og(tree, media_info.uri)
+
+                await self._precache_image_url(user, media_info, og)
+            else:
+                og = {}
 
+        elif oembed_url:
+            # Handle the oEmbed information.
+            og, expiration_ms = await self._handle_oembed_response(
+                url, media_info, expiration_ms
+            )
             await self._precache_image_url(user, media_info, og)
 
         else:
@@ -479,6 +489,39 @@ async def _precache_image_url(
         else:
             del og["og:image"]
 
+    async def _handle_oembed_response(
+        self, url: str, media_info: MediaInfo, expiration_ms: int
+    ) -> Tuple[JsonDict, int]:
+        """
+        Parse the downloaded oEmbed info.
+
+        Args:
+            url: The URL which is being previewed (not the one which was
+                requested).
+            media_info: The media being previewed.
+            expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+        Returns:
+            A tuple of:
+                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The (possibly updated) length of time, in milliseconds, the media is valid for.
+        """
+        # If JSON was not returned, there's nothing to do.
+        if not _is_json(media_info.media_type):
+            return {}, expiration_ms
+
+        with open(media_info.filename, "rb") as file:
+            body = file.read()
+
+        oembed_response = self._oembed.parse_oembed_response(url, body)
+        open_graph_result = oembed_response.open_graph_result
+
+        # Use the cache age from the oEmbed result, if one was given.
+        if open_graph_result and oembed_response.cache_age is not None:
+            expiration_ms = oembed_response.cache_age
+
+        return open_graph_result, expiration_ms
+
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(
             "expire_url_cache_data", self._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     return "utf-8"
 
 
-def decode_and_calc_og(
-    body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+    body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
     """
-    Calculate metadata for an HTML document.
-
-    This uses lxml to parse the HTML document into the OG response. If errors
-    occur during processing of the document, an empty response is returned.
+    This uses lxml to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
-        media_url: The URI used to download the body.
         request_encoding: The character encoding of the body, as a string.
 
     Returns:
-        The OG response as a dictionary.
+        The parsed HTML body, or None if an error occurred during processed.
     """
     # If there's no body, nothing useful is going to be found.
     if not body:
-        return {}
+        return None
 
     from lxml import etree
 
@@ -662,25 +701,22 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
-        return {}
-
-    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
-        # Attempt to parse the body. If this fails, log and return no metadata.
-        tree = etree.fromstring(body_attempt, parser)
-
-        # The data was successfully parsed, but no tree was found.
-        if tree is None:
-            return {}
+        return None
 
-        return _calc_og(tree, media_uri)
+    def _attempt_decode_body(
+        body_attempt: Union[bytes, str]
+    ) -> Optional["etree.Element"]:
+        # Attempt to parse the body. Returns None if the body was successfully
+        # parsed, but no tree was found.
+        return etree.fromstring(body_attempt, parser)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
-        return _attempt_calc_og(body)
+        return _attempt_decode_body(body)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        return _attempt_calc_og(body.decode("utf-8", "ignore"))
+        return _attempt_decode_body(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:

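The net effect of `_handle_oembed_response` on caching is that a cache age supplied by the oEmbed result takes precedence over the expiration derived from the HTTP response. A small, simplified illustration (the helper name and values are made up, not the actual Synapse code):

    # Simplified sketch of the precedence rule; not the actual Synapse code.
    def pick_expiration(http_expiration_ms, oembed_cache_age_ms):
        # Prefer the oEmbed-provided cache age when one was returned.
        if oembed_cache_age_ms is not None:
            return oembed_cache_age_ms
        return http_expiration_ms

    assert pick_expiration(60_000, 3_600_000) == 3_600_000  # oEmbed says: cache for an hour
    assert pick_expiration(60_000, None) == 60_000          # no cache_age: keep the HTTP value
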
tests/rest/media/v1/test_url_preview.py

Lines changed: 99 additions & 1 deletion
@@ -725,9 +725,107 @@ def test_oembed_format(self):
             },
         )
 
+    def test_oembed_autodiscovery(self):
+        """
+        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+        1. Request a preview of a URL which is not known to the oEmbed code.
+        2. It returns HTML including a link to an oEmbed preview.
+        3. The oEmbed preview is requested and returns a URL for an image.
+        4. The image is requested for thumbnailing.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't the
+        # list of oEmbed patterns) to get "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # The oEmbed response.
+        result2 = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result2).encode("utf-8")
+
+        # Ensure a second request is made to the oEmbed URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/oembed?", server.data)
+
+        # Ensure a third request is made to the photo URL.
+        client = self.reactor.tcpClients[2][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+        )
+        self.assertTrue(body["og:image"].startswith("mxc://"))
+        self.assertEqual(body["og:image:height"], 1)
+        self.assertEqual(body["og:image:width"], 1)
+        self.assertEqual(body["og:image:type"], "image/png")
+
     def _download_image(self):
         """Downloads an image into the URL cache.
-
         Returns:
             A (host, media_id) tuple representing the MXC URI of the image.
         """
