Skip to content

Commit 86234dd

Browse files
Kenton Taylor authored and brondsem committed
[#8599] detect potentially misleading links and show the destination domain afterwards
1 parent 9dcdd1f commit 86234dd

File tree

4 files changed

+269
-3
lines changed

4 files changed

+269
-3
lines changed

Allura/allura/lib/app_globals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def cached_convert(self, artifact: MappedClass, field_name: str) -> Markup:
135135
field_name, artifact.__class__.__name__)
136136
return self.convert(source_text)
137137

138-
bugfix_rev = 6 # increment this if we need all caches to invalidated (e.g. xss in markdown rendering fixed)
138+
bugfix_rev = 7 # increment this if we need all caches to invalidated (e.g. xss in markdown rendering fixed)
139139
md5 = None
140140
# If a cached version exists and it is valid, return it.
141141
if cache.md5 is not None:

Allura/allura/lib/utils.py

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import random
3030
import mimetypes
3131
import re
32+
import unicodedata
3233
from pathlib import Path
3334
from typing import TypeVar
3435
import magic
@@ -47,6 +48,7 @@
4748
from collections import OrderedDict
4849

4950
from bs4 import BeautifulSoup
51+
import idna
5052
from tg import redirect, app_globals as g
5153
from tg.decorators import before_validate
5254
from tg.controllers.util import etag_cache
@@ -588,6 +590,16 @@ def __init__(self, *args, **kwargs):
588590
'fn:', 'fnref:', # from footnotes extension
589591
} | set(aslist(tg.config.get('safe_html.id_prefixes', [])))
590592
self._prev_token_was_ok_iframe = False
593+
self._current_link = None
594+
self._pending_link_suffix = None
595+
596+
def __iter__(self):
597+
# override to inject link suffixes in the output
598+
for token in super().__iter__():
599+
yield token
600+
if self._pending_link_suffix is not None:
601+
yield self._pending_link_suffix
602+
self._pending_link_suffix = None
591603

592604
def sanitize_token(self, token):
593605
"""
@@ -633,7 +645,157 @@ def sanitize_token(self, token):
633645
if attrs.get((None, 'type'), '') == "checkbox":
634646
self.allowed_elements.add(input_el)
635647

636-
return super().sanitize_token(token)
648+
sanitized = super().sanitize_token(token)
649+
self._track_link_target(sanitized)
650+
return sanitized
651+
652+
def _track_link_target(self, token):
653+
if token is None:
654+
return
655+
656+
tag_type = token.get('type')
657+
tag_name = token.get('name')
658+
659+
if tag_type == 'StartTag' and tag_name == 'a':
660+
href = token.get('data', {}).get((None, 'href'))
661+
hostname = hostname_from_url(href)
662+
self._current_link = {
663+
'href': href,
664+
'hostname': hostname,
665+
'text_parts': [],
666+
}
667+
return
668+
669+
if self._current_link is None:
670+
return
671+
672+
if tag_type in ('Characters', 'SpaceCharacters'):
673+
self._current_link['text_parts'].append(token.get('data', ''))
674+
return
675+
676+
if tag_type == 'EndTag' and tag_name == 'a':
677+
self._finalize_link_target()
678+
self._current_link = None
679+
680+
def _finalize_link_target(self):
681+
href = self._current_link['href']
682+
hostname = self._current_link['hostname']
683+
if not href or not hostname:
684+
return
685+
686+
visible_text = ''.join(self._current_link['text_parts']).strip()
687+
if not visible_text:
688+
return
689+
690+
idn_lookalike = 'xn--' in hostname and idn_uses_lookalike_chars(hostname)
691+
if not idn_lookalike:
692+
# IDN with ASCII-lookalike chars (e.g. Cyrillic/Greek homographs) always
693+
# gets the suffix so the deception is visible; all other links only get it
694+
# when the visible text looks like a URL pointing somewhere different.
695+
if not text_contains_any_url(visible_text):
696+
return
697+
if text_contains_hostname(visible_text, hostname):
698+
return
699+
700+
self._pending_link_suffix = {
701+
'type': 'Characters',
702+
'data': f' ({hostname})',
703+
}
704+
705+
706+
# Scripts whose characters are visually distinct from ASCII and unlikely to be used
707+
# in homograph attacks. Any non-ASCII character whose Unicode name does NOT start
708+
# with one of these prefixes is treated as a potential ASCII lookalike.
709+
NON_MISLEADING_IDN_SCRIPTS = frozenset([
710+
'CJK', 'HANGUL', 'HIRAGANA', 'KATAKANA', # East Asian
711+
'ARABIC', 'HEBREW', # Semitic
712+
'THAI', 'LAO', 'KHMER', 'MYANMAR', 'TIBETAN', # Southeast/Central Asian
713+
'DEVANAGARI', 'BENGALI', 'GUJARATI', 'GURMUKHI', 'ORIYA', # South Asian (Indic)
714+
'KANNADA', 'MALAYALAM', 'SINHALA', 'TAMIL', 'TELUGU',
715+
'MONGOLIAN', 'ETHIOPIC', 'GEORGIAN',
716+
])
717+
718+
719+
def idn_uses_lookalike_chars(ascii_hostname: str) -> bool:
720+
"""
721+
Return True if any non-ASCII character in the IDN hostname could be visually
722+
confused with an ASCII character (e.g. Cyrillic or Greek lookalikes).
723+
"""
724+
try:
725+
unicode_hostname = idna.decode(ascii_hostname)
726+
except (idna.IDNAError, UnicodeError):
727+
return True # can't decode → assume misleading
728+
for char in unicode_hostname:
729+
if ord(char) <= 127:
730+
continue
731+
name = unicodedata.name(char, '')
732+
script = name.split()[0] if name else ''
733+
if script not in NON_MISLEADING_IDN_SCRIPTS:
734+
return True
735+
return False
736+
737+
738+
def normalize_hostname(hostname: str) -> str:
739+
# handles IDN -> ascii punycode
740+
if not hostname:
741+
return ''
742+
743+
hostname = hostname.strip().rstrip('.').lower()
744+
if not hostname:
745+
return ''
746+
747+
try:
748+
return idna.encode(hostname, uts46=True).decode('ascii').lower()
749+
except idna.IDNAError:
750+
return hostname
751+
752+
753+
def hostname_from_url(url: str) -> str:
754+
if not url:
755+
return ''
756+
# Browsers treat ///foo.com as //foo.com (protocol-relative); urlparse does not,
757+
# so collapse three or more leading slashes to two before parsing.
758+
url = re.sub(r'^/{3,}', '//', url)
759+
return normalize_hostname(urlparse(url).hostname)
760+
761+
762+
_HOST_TOKEN_RE = re.compile(r'[^\s<>()]+') # a run of non-whitespace, non <>() characters
763+
764+
765+
def text_contains_any_url(text: str) -> bool:
766+
"""Return True if the visible text appears to contain a URL or domain name."""
767+
for token in _HOST_TOKEN_RE.findall(text):
768+
candidate = token.strip('\'"[]{}()<>,;!?')
769+
if not candidate or ('@' in candidate and '://' not in candidate):
770+
continue
771+
if '://' in candidate:
772+
if urlparse(candidate).hostname:
773+
return True
774+
elif '.' in candidate and re.search(r'[a-zA-Z]', candidate):
775+
if urlparse('//' + candidate).hostname:
776+
return True
777+
return False
778+
779+
780+
def text_contains_hostname(text: str, hostname: str) -> bool:
781+
normalized_text = text.casefold()
782+
if hostname.casefold() in normalized_text:
783+
return True
784+
785+
for token in _HOST_TOKEN_RE.findall(text):
786+
candidate = token.strip('\'"[]{}()<>,;!?')
787+
if not candidate or ('@' in candidate and '://' not in candidate):
788+
continue
789+
790+
if '://' in candidate:
791+
candidate_hostname = urlparse(candidate).hostname
792+
else:
793+
candidate_hostname = urlparse('//' + candidate).hostname
794+
795+
if normalize_hostname(candidate_hostname) == hostname:
796+
return True
797+
798+
return False
637799

638800

639801
def ip_address(request):

Allura/allura/tests/test_utils.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from tg import config
3333
import html5lib
3434
import html5lib.treewalkers
35+
import html5lib.serializer
3536

3637
from alluratest.controller import setup_unit_test
3738

@@ -242,12 +243,20 @@ def greetings(self):
242243

243244
class TestHTMLSanitizer:
244245

246+
def setup_method(self, method):
247+
setup_unit_test()
248+
245249
def walker_from_text(self, text):
246250
parsed = html5lib.parseFragment(text)
247251
TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
248252
walker = TreeWalker(parsed)
249253
return walker
250254

255+
def sanitize_html(self, html):
256+
with h.push_config(config, domain='mysite.com'):
257+
filt = utils.ForgeHTMLSanitizerFilter(self.walker_from_text(html))
258+
return html5lib.serializer.HTMLSerializer().render(filt)
259+
251260
def simple_tag_list(self, sanitizer):
252261
# no attrs, no close tag flag check, just real simple
253262
return [
@@ -285,6 +294,100 @@ def test_html_sanitizer_summary(self):
285294
p = utils.ForgeHTMLSanitizerFilter(walker)
286295
assert self.simple_tag_list(p) == ['details', 'summary', 'summary', 'ul', 'li', 'li', 'ul', 'details']
287296

297+
def test_misleading_links(self):
298+
# OK: generic text
299+
assert (self.sanitize_html('<a href="http://evil.com/">click here</a>') ==
300+
'<a href="http://evil.com/">click here</a>')
301+
302+
# OK: domain-like text matching actual link
303+
assert (self.sanitize_html('<a href="http://evil.com/path">evil.com</a>') ==
304+
'<a href="http://evil.com/path">evil.com</a>')
305+
306+
# SUFFIX: domain-like text that is a different domain
307+
assert (self.sanitize_html('<a href="http://evil.com/">example.com</a>') ==
308+
'<a href="http://evil.com/">example.com</a> (evil.com)')
309+
310+
# SUFFIX: domain with protocol, that is a different URL
311+
assert (self.sanitize_html('<a href="http://evil.com/">http://example.com/</a>') ==
312+
'<a href="http://evil.com/">http://example.com/</a> (evil.com)')
313+
314+
# SUFFIX: different protocol variations
315+
assert (self.sanitize_html('<a href="//evil.com/">http://example.com/</a>') ==
316+
'<a href="//evil.com/">http://example.com/</a> (evil.com)')
317+
assert (self.sanitize_html('<a href="///evil.com/">http://example.com/</a>') ==
318+
'<a href="///evil.com/">http://example.com/</a> (evil.com)')
319+
320+
# OK: URL text matching actual link
321+
assert (self.sanitize_html('<a href="http://evil.com/">http://evil.com/</a>') ==
322+
'<a href="http://evil.com/">http://evil.com/</a>')
323+
324+
# false-positive SUFFIX due to domain name detection .txt looks like a TLD :(
325+
assert (self.sanitize_html('<a href="https://example.com/repo/README.txt">README.txt</a>') ==
326+
'<a href="https://example.com/repo/README.txt">README.txt</a> (example.com)')
327+
328+
# OK: internal/relative links
329+
assert (self.sanitize_html('<a href="/p/local/wiki/">example.com</a>') ==
330+
'<a href="/p/local/wiki/">example.com</a>')
331+
332+
# SUFFIX: mismatch domain embedded in a longer sentence
333+
assert (self.sanitize_html('<a href="http://evil.com/">visit example.com for more</a>') ==
334+
'<a href="http://evil.com/">visit example.com for more</a> (evil.com)')
335+
336+
# SUFFIX: mismatch domain with path in longer sentence
337+
assert (self.sanitize_html('<a href="http://evil.com/">see example.com/some/page for details</a>') ==
338+
'<a href="http://evil.com/">see example.com/some/page for details</a> (evil.com)')
339+
340+
# SUFFIX: mismatch full URL in longer sentence
341+
assert (self.sanitize_html('<a href="http://evil.com/">see http://example.com/some/page for details</a>') ==
342+
'<a href="http://evil.com/">see http://example.com/some/page for details</a> (evil.com)')
343+
344+
def test_misleading_links_idn(self):
345+
# аpple.com with Cyrillic а (U+0430) — homograph of apple.com
346+
cyrillic_a = '\u0430'
347+
348+
# text and href use same IDN domain: always show punycode suffix for IDN lookalike destinations
349+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">{cyrillic_a}pple.com</a>') ==
350+
f'<a href="http://{cyrillic_a}pple.com/">{cyrillic_a}pple.com</a> (xn--pple-43d.com)')
351+
352+
# lookalike unicode text linking to the real domain: suffix reveals real hostname
353+
assert (self.sanitize_html(f'<a href="http://apple.com/">{cyrillic_a}pple.com</a>') ==
354+
f'<a href="http://apple.com/">{cyrillic_a}pple.com</a> (apple.com)')
355+
356+
# real-looking text linking to IDN domain: suffix reveals punycode hostname
357+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">apple.com</a>') ==
358+
f'<a href="http://{cyrillic_a}pple.com/">apple.com</a> (xn--pple-43d.com)')
359+
360+
# even generic link text gets the suffix when destination is IDN lookalike
361+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">click here</a>') ==
362+
f'<a href="http://{cyrillic_a}pple.com/">click here</a> (xn--pple-43d.com)')
363+
364+
# Chinese IDN — not an ASCII lookalike, so treated like a normal domain
365+
chinese_domain = '中文.com'
366+
# generic text: no suffix (Chinese chars are not ASCII lookalikes)
367+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">click here</a>') ==
368+
'<a href="http://中文.com/">click here</a>')
369+
# misleading domain text: suffix shown (same as non-IDN behavior)
370+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">example.com</a>') ==
371+
'<a href="http://中文.com/">example.com</a> (xn--fiq228c.com)')
372+
# correct domain text: no suffix
373+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">{chinese_domain}</a>') ==
374+
'<a href="http://中文.com/">中文.com</a>')
375+
376+
377+
def test_text_contains_any_url():
378+
assert utils.text_contains_any_url('example.com') is True
379+
assert utils.text_contains_any_url('foo a1-2-3.space bar') is True
380+
assert utils.text_contains_any_url('click here') is False
381+
# false positives: file extensions and library names with dots parse as hostnames
382+
assert utils.text_contains_any_url('README.md') is True # .md suffix
383+
assert utils.text_contains_any_url('Node.js') is True # .js suffix
384+
385+
386+
def test_text_contains_hostname():
387+
assert utils.text_contains_hostname('visit example.com for info', 'example.com') is True
388+
assert utils.text_contains_hostname('visit example.com for info', 'example.comevil.org') is False
389+
assert utils.text_contains_hostname('click here', 'example.com') is False
390+
288391

289392
def test_ip_address():
290393
req = Mock()
@@ -434,4 +537,4 @@ def test_join_paths_no_traversal(base, paths, expected: str | type[Exception]):
434537
def test_hide_email():
435538
assert utils.hide_email('foo@bar.com') == '<fo...@ba...>'
436539
assert utils.hide_email('email@example.com') == '<em...@ex...>'
437-
assert utils.hide_email('email@example') == '<em...@ex...>'
540+
assert utils.hide_email('email@example') == '<em...@ex...>'

requirements.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ feedparser
1515
FormEncode
1616
GitPython
1717
html5lib
18+
idna
1819
Jinja2
1920
# Webob uses legacy-cgi and so our tests need it too for cgi.FieldStorage, maybe webob 2 will remove it? https://github.com/Pylons/webob/pull/466
2021
legacy-cgi ; python_full_version >= '3.13'

0 commit comments

Comments
 (0)