2222import shutil
2323import sys
2424import traceback
25- from typing import TYPE_CHECKING , Dict , Generator , Iterable , Optional , Union
25+ from typing import TYPE_CHECKING , Dict , Generator , Iterable , Optional , Tuple , Union
2626from urllib import parse as urlparse
2727
2828import attr
@@ -296,22 +296,32 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
296296 body = file .read ()
297297
298298 encoding = get_html_media_encoding (body , media_info .media_type )
299- og = decode_and_calc_og (body , media_info .uri , encoding )
300-
301- await self ._precache_image_url (user , media_info , og )
302-
303- elif oembed_url and _is_json (media_info .media_type ):
304- # Handle an oEmbed response.
305- with open (media_info .filename , "rb" ) as file :
306- body = file .read ()
307-
308- oembed_response = self ._oembed .parse_oembed_response (url , body )
309- og = oembed_response .open_graph_result
310-
311- # Use the cache age from the oEmbed result, instead of the HTTP response.
312- if oembed_response .cache_age is not None :
313- expiration_ms = oembed_response .cache_age
299+ tree = decode_body (body , encoding )
300+ if tree is not None :
301+ # Check if this HTML document points to oEmbed information and
302+ # defer to that.
303+ oembed_url = self ._oembed .autodiscover_from_html (tree )
304+ og = {}
305+ if oembed_url :
306+ oembed_info = await self ._download_url (oembed_url , user )
307+ og , expiration_ms = await self ._handle_oembed_response (
308+ url , oembed_info , expiration_ms
309+ )
310+
311+ # If there was no oEmbed URL (or oEmbed parsing failed), attempt
312+ # to generate the Open Graph information from the HTML.
313+ if not oembed_url or not og :
314+ og = _calc_og (tree , media_info .uri )
315+
316+ await self ._precache_image_url (user , media_info , og )
317+ else :
318+ og = {}
314319
320+ elif oembed_url :
321+ # Handle the oEmbed information.
322+ og , expiration_ms = await self ._handle_oembed_response (
323+ url , media_info , expiration_ms
324+ )
315325 await self ._precache_image_url (user , media_info , og )
316326
317327 else :
@@ -479,6 +489,39 @@ async def _precache_image_url(
479489 else :
480490 del og ["og:image" ]
481491
async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, int]:
    """
    Convert downloaded oEmbed media into Open Graph data.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed; its ``media_type`` and
            ``filename`` are read here.
        expiration_ms: The length of time, in milliseconds, the media is valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary produced by the oEmbed parser, or an
            empty dictionary when the media type is not JSON.
            The length of time, in milliseconds, the media is valid for. This
            is replaced by the oEmbed cache age only when an Open Graph
            result was produced and a cache age was supplied.
    """
    if _is_json(media_info.media_type):
        # The oEmbed payload was already downloaded to disk; load it whole.
        with open(media_info.filename, "rb") as oembed_file:
            raw_body = oembed_file.read()

        parsed = self._oembed.parse_oembed_response(url, raw_body)
        og = parsed.open_graph_result

        # Prefer the cache age from the oEmbed result, but only when parsing
        # actually yielded something to cache.
        # NOTE(review): assumes cache_age is already expressed in
        # milliseconds; confirm against parse_oembed_response.
        if og and parsed.cache_age is not None:
            expiration_ms = parsed.cache_age

        return og, expiration_ms

    # Anything other than JSON cannot be an oEmbed response we understand.
    return {}, expiration_ms
482525 def _start_expire_url_cache_data (self ) -> Deferred :
483526 return run_as_background_process (
484527 "expire_url_cache_data" , self ._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
631674 return "utf-8"
632675
633676
634- def decode_and_calc_og (
635- body : bytes , media_uri : str , request_encoding : Optional [str ] = None
636- ) -> JsonDict :
677+ def decode_body (
678+ body : bytes , request_encoding : Optional [str ] = None
679+ ) -> Optional [ "etree.Element" ] :
637680 """
638- Calculate metadata for an HTML document.
639-
640- This uses lxml to parse the HTML document into the OG response. If errors
641- occur during processing of the document, an empty response is returned.
681+ This uses lxml to parse the HTML document.
642682
643683 Args:
644684 body: The HTML document, as bytes.
645- media_url: The URI used to download the body.
646685 request_encoding: The character encoding of the body, as a string.
647686
648687 Returns:
649- The OG response as a dictionary .
688+ The parsed HTML body, or None if an error occurred during processing.
650689 """
651690 # If there's no body, nothing useful is going to be found.
652691 if not body :
653- return {}
692+ return None
654693
655694 from lxml import etree
656695
@@ -662,25 +701,22 @@ def decode_and_calc_og(
662701 parser = etree .HTMLParser (recover = True , encoding = "utf-8" )
663702 except Exception as e :
664703 logger .warning ("Unable to create HTML parser: %s" % (e ,))
665- return {}
666-
667- def _attempt_calc_og (body_attempt : Union [bytes , str ]) -> Dict [str , Optional [str ]]:
668- # Attempt to parse the body. If this fails, log and return no metadata.
669- tree = etree .fromstring (body_attempt , parser )
670-
671- # The data was successfully parsed, but no tree was found.
672- if tree is None :
673- return {}
704+ return None
674705
675- return _calc_og (tree , media_uri )
706+ def _attempt_decode_body (
707+ body_attempt : Union [bytes , str ]
708+ ) -> Optional ["etree.Element" ]:
709+ # Attempt to parse the body. Returns None if the body was successfully
710+ # parsed, but no tree was found.
711+ return etree .fromstring (body_attempt , parser )
676712
677713 # Attempt to parse the body. If this raises a UnicodeDecodeError, retry below with the body decoded as UTF-8.
678714 try :
679- return _attempt_calc_og (body )
715+ return _attempt_decode_body (body )
680716 except UnicodeDecodeError :
681717 # blindly try decoding the body as utf-8, which seems to fix
682718 # the charset mismatches on https://google.com
683- return _attempt_calc_og (body .decode ("utf-8" , "ignore" ))
719+ return _attempt_decode_body (body .decode ("utf-8" , "ignore" ))
684720
685721
686722def _calc_og (tree : "etree.Element" , media_uri : str ) -> Dict [str , Optional [str ]]:
0 commit comments