Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions plugins/Web/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,12 @@ def configure(advanced):
seconds the bot will wait for the site to respond, when using the 'fetch'
command in this plugin. If 0, will use socket.defaulttimeout"""))

conf.registerGlobalValue(Web, 'useOembedRegistry',
registry.Boolean(False, _("""Determines whether the bot will use the
oembed.com providers registry.""")))

conf.registerGlobalValue(Web, 'useOembedDiscovery',
registry.Boolean(False, _("""Determines whether the bot will use HTML
discovery to find oEmbed endpoints.""")))
Comment on lines +98 to +104
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Name the config variables plugins.Web.oembed.registry and plugins.Web.oembed.discovery.

Could you also make them channel-specific, but not op-settable?


# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
90 changes: 81 additions & 9 deletions plugins/Web/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import sys
import string
import socket
import json

import supybot.conf as conf
import supybot.utils as utils
Expand Down Expand Up @@ -143,7 +144,23 @@ class Web(callbacks.PluginRegexp):
"""Add the help for 'help Web' here."""
regexps = ['titleSnarfer']
threaded = True

_oembed_providers = None

def _loadOEmbedProviders(self):
"""
Loads the oEmbed providers JSON if not already loaded.
Returns the providers list.
"""
if self._oembed_providers is None:
try:
providers_url = "https://oembed.com/providers.json"
response = utils.web.getUrl(providers_url)
self._oembed_providers = json.loads(response)
except Exception as e:
self.log.debug(f"Failed to load oEmbed providers: {e}")
self._oembed_providers = []
return self._oembed_providers

def noIgnore(self, irc, msg):
return not self.registryValue('checkIgnored', msg.channel, irc.network)

Expand Down Expand Up @@ -264,6 +281,55 @@ def url_workaround(url):
'to have no HTML title within the first %S.',
url, size)

def _getOEmbedEndpoint(self, url):
"""
Finds the appropriate oEmbed endpoint for the given URL.
First tries the providers registry if enabled, then falls back to
HTML discovery if needed and enabled.
"""
if self.registryValue('useOembedRegistry'):
providers = self._loadOEmbedProviders()
for provider in providers:
for pattern in provider.get('endpoints', []):
schemes = pattern.get('schemes', [])
endpoint = pattern.get('url', '')
for scheme in schemes:
regex = re.escape(scheme).replace(r'\*', '.*')
if re.match(regex, url):
return endpoint
if self.registryValue('useOembedDiscovery'):
try:
timeout = self.registryValue('timeout')
response = utils.web.getUrl(url, timeout=timeout)
text = response.decode('utf8', errors='replace')
match = re.search(
r'<link[^>]+?type="application/json\+oembed"[^>]+?href="([^"]+)"',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's insufficient, it does not cover different attribute orders or single quotes.

text,
re.IGNORECASE)
if match:
endpoint = match.group(1)
endpoint = endpoint.split('?')[0]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

return endpoint
except Exception as e:
self.log.debug(f"Failed to discover oEmbed endpoint in HTML: {e}")
return None

def getOEmbedTitle(self, url):
"""
Retrieves the oEmbed title.
"""
try:
oembed_endpoint = self._getOEmbedEndpoint(url)
if not oembed_endpoint:
return None
oembed_url = f"{oembed_endpoint}?format=json&url={url}"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't you need to escape the URL? (use urllib.parse.urlunparse)

response = utils.web.getUrl(oembed_url)
oembed_data = json.loads(response)
return oembed_data.get('title')
except Exception as e:
self.log.debug(f"Failed to retrieve oEmbed title: {e}")
return None

@fetch_sandbox
def titleSnarfer(self, irc, msg, match):
channel = msg.channel
Expand All @@ -280,10 +346,13 @@ def titleSnarfer(self, irc, msg, match):
if r and r.search(url):
self.log.debug('Not titleSnarfing %q.', url)
return
r = self.getTitle(irc, url, False, msg)
if not r:
return
(target, title) = r
title = self.getOEmbedTitle(url)
target = url
if not title:
r = self.getTitle(irc, url, False, msg)
if not r:
return
(target, title) = r
if title:
domain = utils.web.getDomain(target
if self.registryValue('snarferShowTargetDomain',
Expand Down Expand Up @@ -420,10 +489,13 @@ def title(self, irc, msg, args, optlist, url):
if not self._checkURLWhitelist(url):
irc.error("This url is not on the whitelist.")
return
r = self.getTitle(irc, url, True, msg)
if not r:
return
(target, title) = r
title = self.getOEmbedTitle(url)
target = url
if not title:
r = self.getTitle(irc, url, True, msg)
if not r:
return
(target, title) = r
if title:
if not [y for x,y in optlist if x == 'no-filter']:
for i in range(1, 4):
Expand Down
25 changes: 25 additions & 0 deletions plugins/Web/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,31 @@ def testWhitelist(self):
conf.supybot.plugins.Web.urlWhitelist.set('')
conf.supybot.plugins.Web.fetch.maximum.set(fm)

def testtitleOembedRegistry(self):
try:
conf.supybot.plugins.Web.useOembedRegistry.setValue(True)
self.assertResponse(
'title https://www.flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
finally:
conf.supybot.plugins.Web.useOembedRegistry.setValue(False)

def testtitleOembedDiscovery(self):
try:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(True)
self.assertResponse(
'title https://flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
finally:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(False)

def testtitleOembedError(self):
try:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(True)
self.assertError('title https://nonexistent.example.com/post/123')
finally:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(False)
Comment on lines +182 to +205
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def testtitleOembedRegistry(self):
try:
conf.supybot.plugins.Web.useOembedRegistry.setValue(True)
self.assertResponse(
'title https://www.flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
finally:
conf.supybot.plugins.Web.useOembedRegistry.setValue(False)
def testtitleOembedDiscovery(self):
try:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(True)
self.assertResponse(
'title https://flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
finally:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(False)
def testtitleOembedError(self):
try:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(True)
self.assertError('title https://nonexistent.example.com/post/123')
finally:
conf.supybot.plugins.Web.useOembedDiscovery.setValue(False)
def testtitleOembedRegistry(self):
with conf.supybot.plugins.Web.useOembedRegistry.context(True):
self.assertResponse(
'title https://www.flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
def testtitleOembedDiscovery(self):
with conf.supybot.plugins.Web.useOembedDiscovery.context(True):
self.assertResponse(
'title https://flickr.com/photos/bees/2362225867/',
'Bacon Lollys')
def testtitleOembedError(self):
with conf.supybot.plugins.Web.useOembedDiscovery.context(True):
self.assertError('title https://nonexistent.example.com/post/123')

use the new style, it's shorter


def testNonSnarfingRegexpConfigurable(self):
self.assertSnarfNoResponse('http://foo.bar.baz/', 2)
try:
Expand Down