Skip to content

Commit 86234dd

Browse files
Kenton Taylor authored and brondsem committed
[#8599] detect potentially misleading links and show the destination domain afterwards
1 parent 9dcdd1f commit 86234dd

File tree

4 files changed

+269
-3
lines changed

4 files changed

+269
-3
lines changed

Allura/allura/lib/app_globals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def cached_convert(self, artifact: MappedClass, field_name: str) -> Markup:
135135
field_name, artifact.__class__.__name__)
136136
return self.convert(source_text)
137137

138-
bugfix_rev = 6 # increment this if we need all caches to invalidated (e.g. xss in markdown rendering fixed)
138+
bugfix_rev = 7 # increment this if we need all caches to invalidated (e.g. xss in markdown rendering fixed)
139139
md5 = None
140140
# If a cached version exists and it is valid, return it.
141141
if cache.md5 is not None:

Allura/allura/lib/utils.py

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import random
3030
import mimetypes
3131
import re
32+
import unicodedata
3233
from pathlib import Path
3334
from typing import TypeVar
3435
import magic
@@ -47,6 +48,7 @@
4748
from collections import OrderedDict
4849

4950
from bs4 import BeautifulSoup
51+
import idna
5052
from tg import redirect, app_globals as g
5153
from tg.decorators import before_validate
5254
from tg.controllers.util import etag_cache
@@ -588,6 +590,16 @@ def __init__(self, *args, **kwargs):
588590
'fn:', 'fnref:', # from footnotes extension
589591
} | set(aslist(tg.config.get('safe_html.id_prefixes', [])))
590592
self._prev_token_was_ok_iframe = False
593+
self._current_link = None
594+
self._pending_link_suffix = None
595+
596+
def __iter__(self):
597+
# override to inject link suffixes in the output
598+
for token in super().__iter__():
599+
yield token
600+
if self._pending_link_suffix is not None:
601+
yield self._pending_link_suffix
602+
self._pending_link_suffix = None
591603

592604
def sanitize_token(self, token):
593605
"""
@@ -633,7 +645,157 @@ def sanitize_token(self, token):
633645
if attrs.get((None, 'type'), '') == "checkbox":
634646
self.allowed_elements.add(input_el)
635647

636-
return super().sanitize_token(token)
648+
sanitized = super().sanitize_token(token)
649+
self._track_link_target(sanitized)
650+
return sanitized
651+
652+
def _track_link_target(self, token):
653+
if token is None:
654+
return
655+
656+
tag_type = token.get('type')
657+
tag_name = token.get('name')
658+
659+
if tag_type == 'StartTag' and tag_name == 'a':
660+
href = token.get('data', {}).get((None, 'href'))
661+
hostname = hostname_from_url(href)
662+
self._current_link = {
663+
'href': href,
664+
'hostname': hostname,
665+
'text_parts': [],
666+
}
667+
return
668+
669+
if self._current_link is None:
670+
return
671+
672+
if tag_type in ('Characters', 'SpaceCharacters'):
673+
self._current_link['text_parts'].append(token.get('data', ''))
674+
return
675+
676+
if tag_type == 'EndTag' and tag_name == 'a':
677+
self._finalize_link_target()
678+
self._current_link = None
679+
680+
def _finalize_link_target(self):
681+
href = self._current_link['href']
682+
hostname = self._current_link['hostname']
683+
if not href or not hostname:
684+
return
685+
686+
visible_text = ''.join(self._current_link['text_parts']).strip()
687+
if not visible_text:
688+
return
689+
690+
idn_lookalike = 'xn--' in hostname and idn_uses_lookalike_chars(hostname)
691+
if not idn_lookalike:
692+
# IDN with ASCII-lookalike chars (e.g. Cyrillic/Greek homographs) always
693+
# gets the suffix so the deception is visible; all other links only get it
694+
# when the visible text looks like a URL pointing somewhere different.
695+
if not text_contains_any_url(visible_text):
696+
return
697+
if text_contains_hostname(visible_text, hostname):
698+
return
699+
700+
self._pending_link_suffix = {
701+
'type': 'Characters',
702+
'data': f' ({hostname})',
703+
}
704+
705+
706+
# Scripts whose characters are visually distinct from ASCII and unlikely to be used
707+
# in homograph attacks. Any non-ASCII character whose Unicode name does NOT start
708+
# with one of these prefixes is treated as a potential ASCII lookalike.
709+
NON_MISLEADING_IDN_SCRIPTS = frozenset([
710+
'CJK', 'HANGUL', 'HIRAGANA', 'KATAKANA', # East Asian
711+
'ARABIC', 'HEBREW', # Semitic
712+
'THAI', 'LAO', 'KHMER', 'MYANMAR', 'TIBETAN', # Southeast/Central Asian
713+
'DEVANAGARI', 'BENGALI', 'GUJARATI', 'GURMUKHI', 'ORIYA', # South Asian (Indic)
714+
'KANNADA', 'MALAYALAM', 'SINHALA', 'TAMIL', 'TELUGU',
715+
'MONGOLIAN', 'ETHIOPIC', 'GEORGIAN',
716+
])
717+
718+
719+
def idn_uses_lookalike_chars(ascii_hostname: str) -> bool:
720+
"""
721+
Return True if any non-ASCII character in the IDN hostname could be visually
722+
confused with an ASCII character (e.g. Cyrillic or Greek lookalikes).
723+
"""
724+
try:
725+
unicode_hostname = idna.decode(ascii_hostname)
726+
except (idna.IDNAError, UnicodeError):
727+
return True # can't decode → assume misleading
728+
for char in unicode_hostname:
729+
if ord(char) <= 127:
730+
continue
731+
name = unicodedata.name(char, '')
732+
script = name.split()[0] if name else ''
733+
if script not in NON_MISLEADING_IDN_SCRIPTS:
734+
return True
735+
return False
736+
737+
738+
def normalize_hostname(hostname: str) -> str:
739+
# handles IDN -> ascii punycode
740+
if not hostname:
741+
return ''
742+
743+
hostname = hostname.strip().rstrip('.').lower()
744+
if not hostname:
745+
return ''
746+
747+
try:
748+
return idna.encode(hostname, uts46=True).decode('ascii').lower()
749+
except idna.IDNAError:
750+
return hostname
751+
752+
753+
def hostname_from_url(url: str) -> str:
754+
if not url:
755+
return ''
756+
# Browsers treat ///foo.com as //foo.com (protocol-relative); urlparse does not,
757+
# so collapse three or more leading slashes to two before parsing.
758+
url = re.sub(r'^/{3,}', '//', url)
759+
return normalize_hostname(urlparse(url).hostname)
760+
761+
762+
_HOST_TOKEN_RE = re.compile(r'[^\s<>()]+') # a run of non-whitespace, non <>() characters
763+
764+
765+
def text_contains_any_url(text: str) -> bool:
766+
"""Return True if the visible text appears to contain a URL or domain name."""
767+
for token in _HOST_TOKEN_RE.findall(text):
768+
candidate = token.strip('\'"[]{}()<>,;!?')
769+
if not candidate or ('@' in candidate and '://' not in candidate):
770+
continue
771+
if '://' in candidate:
772+
if urlparse(candidate).hostname:
773+
return True
774+
elif '.' in candidate and re.search(r'[a-zA-Z]', candidate):
775+
if urlparse('//' + candidate).hostname:
776+
return True
777+
return False
778+
779+
780+
def text_contains_hostname(text: str, hostname: str) -> bool:
781+
normalized_text = text.casefold()
782+
if hostname.casefold() in normalized_text:
783+
return True
784+
785+
for token in _HOST_TOKEN_RE.findall(text):
786+
candidate = token.strip('\'"[]{}()<>,;!?')
787+
if not candidate or ('@' in candidate and '://' not in candidate):
788+
continue
789+
790+
if '://' in candidate:
791+
candidate_hostname = urlparse(candidate).hostname
792+
else:
793+
candidate_hostname = urlparse('//' + candidate).hostname
794+
795+
if normalize_hostname(candidate_hostname) == hostname:
796+
return True
797+
798+
return False
637799

638800

639801
def ip_address(request):

Allura/allura/tests/test_utils.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from tg import config
3333
import html5lib
3434
import html5lib.treewalkers
35+
import html5lib.serializer
3536

3637
from alluratest.controller import setup_unit_test
3738

@@ -242,12 +243,20 @@ def greetings(self):
242243

243244
class TestHTMLSanitizer:
244245

246+
def setup_method(self, method):
247+
setup_unit_test()
248+
245249
def walker_from_text(self, text):
246250
parsed = html5lib.parseFragment(text)
247251
TreeWalker = html5lib.treewalkers.getTreeWalker("etree")
248252
walker = TreeWalker(parsed)
249253
return walker
250254

255+
def sanitize_html(self, html):
256+
with h.push_config(config, domain='mysite.com'):
257+
filt = utils.ForgeHTMLSanitizerFilter(self.walker_from_text(html))
258+
return html5lib.serializer.HTMLSerializer().render(filt)
259+
251260
def simple_tag_list(self, sanitizer):
252261
# no attrs, no close tag flag check, just real simple
253262
return [
@@ -285,6 +294,100 @@ def test_html_sanitizer_summary(self):
285294
p = utils.ForgeHTMLSanitizerFilter(walker)
286295
assert self.simple_tag_list(p) == ['details', 'summary', 'summary', 'ul', 'li', 'li', 'ul', 'details']
287296

297+
def test_misleading_links(self):
298+
# OK: generic text
299+
assert (self.sanitize_html('<a href="http://evil.com/">click here</a>') ==
300+
'<a href="http://evil.com/">click here</a>')
301+
302+
# OK: domain-like text matching actual link
303+
assert (self.sanitize_html('<a href="http://evil.com/path">evil.com</a>') ==
304+
'<a href="http://evil.com/path">evil.com</a>')
305+
306+
# SUFFIX: domain-like text that is a different domain
307+
assert (self.sanitize_html('<a href="http://evil.com/">example.com</a>') ==
308+
'<a href="http://evil.com/">example.com</a> (evil.com)')
309+
310+
# SUFFIX: domain with protocol, that is a different URL
311+
assert (self.sanitize_html('<a href="http://evil.com/">http://example.com/</a>') ==
312+
'<a href="http://evil.com/">http://example.com/</a> (evil.com)')
313+
314+
# SUFFIX: different protocol variations
315+
assert (self.sanitize_html('<a href="//evil.com/">http://example.com/</a>') ==
316+
'<a href="//evil.com/">http://example.com/</a> (evil.com)')
317+
assert (self.sanitize_html('<a href="///evil.com/">http://example.com/</a>') ==
318+
'<a href="///evil.com/">http://example.com/</a> (evil.com)')
319+
320+
# OK: URL text matching actual link
321+
assert (self.sanitize_html('<a href="http://evil.com/">http://evil.com/</a>') ==
322+
'<a href="http://evil.com/">http://evil.com/</a>')
323+
324+
# false-positive SUFFIX due to domain name detection .txt looks like a TLD :(
325+
assert (self.sanitize_html('<a href="https://example.com/repo/README.txt">README.txt</a>') ==
326+
'<a href="https://example.com/repo/README.txt">README.txt</a> (example.com)')
327+
328+
# OK: internal/relative links
329+
assert (self.sanitize_html('<a href="/p/local/wiki/">example.com</a>') ==
330+
'<a href="/p/local/wiki/">example.com</a>')
331+
332+
# SUFFIX: mismatch domain embedded in a longer sentence
333+
assert (self.sanitize_html('<a href="http://evil.com/">visit example.com for more</a>') ==
334+
'<a href="http://evil.com/">visit example.com for more</a> (evil.com)')
335+
336+
# SUFFIX: mismatch domain with path in longer sentence
337+
assert (self.sanitize_html('<a href="http://evil.com/">see example.com/some/page for details</a>') ==
338+
'<a href="http://evil.com/">see example.com/some/page for details</a> (evil.com)')
339+
340+
# SUFFIX: mismatch full URL in longer sentence
341+
assert (self.sanitize_html('<a href="http://evil.com/">see http://example.com/some/page for details</a>') ==
342+
'<a href="http://evil.com/">see http://example.com/some/page for details</a> (evil.com)')
343+
344+
def test_misleading_links_idn(self):
345+
# аpple.com with Cyrillic а (U+0430) — homograph of apple.com
346+
cyrillic_a = '\u0430'
347+
348+
# text and href use same IDN domain: always show punycode suffix for IDN lookalike destinations
349+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">{cyrillic_a}pple.com</a>') ==
350+
f'<a href="http://{cyrillic_a}pple.com/">{cyrillic_a}pple.com</a> (xn--pple-43d.com)')
351+
352+
# lookalike unicode text linking to the real domain: suffix reveals real hostname
353+
assert (self.sanitize_html(f'<a href="http://apple.com/">{cyrillic_a}pple.com</a>') ==
354+
f'<a href="http://apple.com/">{cyrillic_a}pple.com</a> (apple.com)')
355+
356+
# real-looking text linking to IDN domain: suffix reveals punycode hostname
357+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">apple.com</a>') ==
358+
f'<a href="http://{cyrillic_a}pple.com/">apple.com</a> (xn--pple-43d.com)')
359+
360+
# even generic link text gets the suffix when destination is IDN lookalike
361+
assert (self.sanitize_html(f'<a href="http://{cyrillic_a}pple.com/">click here</a>') ==
362+
f'<a href="http://{cyrillic_a}pple.com/">click here</a> (xn--pple-43d.com)')
363+
364+
# Chinese IDN — not an ASCII lookalike, so treated like a normal domain
365+
chinese_domain = '中文.com'
366+
# generic text: no suffix (Chinese chars are not ASCII lookalikes)
367+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">click here</a>') ==
368+
'<a href="http://中文.com/">click here</a>')
369+
# misleading domain text: suffix shown (same as non-IDN behavior)
370+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">example.com</a>') ==
371+
'<a href="http://中文.com/">example.com</a> (xn--fiq228c.com)')
372+
# correct domain text: no suffix
373+
assert (self.sanitize_html(f'<a href="http://{chinese_domain}/">{chinese_domain}</a>') ==
374+
'<a href="http://中文.com/">中文.com</a>')
375+
376+
377+
def test_text_contains_any_url():
378+
assert utils.text_contains_any_url('example.com') is True
379+
assert utils.text_contains_any_url('foo a1-2-3.space bar') is True
380+
assert utils.text_contains_any_url('click here') is False
381+
# false positives: file extensions and library names with dots parse as hostnames
382+
assert utils.text_contains_any_url('README.md') is True # .md suffix
383+
assert utils.text_contains_any_url('Node.js') is True # .js suffix
384+
385+
386+
def test_text_contains_hostname():
387+
assert utils.text_contains_hostname('visit example.com for info', 'example.com') is True
388+
assert utils.text_contains_hostname('visit example.com for info', 'example.comevil.org') is False
389+
assert utils.text_contains_hostname('click here', 'example.com') is False
390+
288391

289392
def test_ip_address():
290393
req = Mock()
@@ -434,4 +537,4 @@ def test_join_paths_no_traversal(base, paths, expected: str | type[Exception]):
434537
def test_hide_email():
435538
assert utils.hide_email('foo@bar.com') == '<fo...@ba...>'
436539
assert utils.hide_email('email@example.com') == '<em...@ex...>'
437-
assert utils.hide_email('email@example') == '<em...@ex...>'
540+
assert utils.hide_email('email@example') == '<em...@ex...>'

requirements.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ feedparser
1515
FormEncode
1616
GitPython
1717
html5lib
18+
idna
1819
Jinja2
1920
# Webob uses legacy-cgi and so our tests need it too for cgi.FieldStorage, maybe webob 2 will remove it? https://github.com/Pylons/webob/pull/466
2021
legacy-cgi ; python_full_version >= '3.13'

0 commit comments

Comments
 (0)