|
29 | 29 | import random |
30 | 30 | import mimetypes |
31 | 31 | import re |
| 32 | +import unicodedata |
32 | 33 | from pathlib import Path |
33 | 34 | from typing import TypeVar |
34 | 35 | import magic |
|
47 | 48 | from collections import OrderedDict |
48 | 49 |
|
49 | 50 | from bs4 import BeautifulSoup |
| 51 | +import idna |
50 | 52 | from tg import redirect, app_globals as g |
51 | 53 | from tg.decorators import before_validate |
52 | 54 | from tg.controllers.util import etag_cache |
@@ -588,6 +590,16 @@ def __init__(self, *args, **kwargs): |
588 | 590 | 'fn:', 'fnref:', # from footnotes extension |
589 | 591 | } | set(aslist(tg.config.get('safe_html.id_prefixes', []))) |
590 | 592 | self._prev_token_was_ok_iframe = False |
| 593 | + self._current_link = None |
| 594 | + self._pending_link_suffix = None |
| 595 | + |
| 596 | + def __iter__(self): |
| 597 | + # override to inject link suffixes in the output |
| 598 | + for token in super().__iter__(): |
| 599 | + yield token |
| 600 | + if self._pending_link_suffix is not None: |
| 601 | + yield self._pending_link_suffix |
| 602 | + self._pending_link_suffix = None |
591 | 603 |
|
592 | 604 | def sanitize_token(self, token): |
593 | 605 | """ |
@@ -633,7 +645,157 @@ def sanitize_token(self, token): |
633 | 645 | if attrs.get((None, 'type'), '') == "checkbox": |
634 | 646 | self.allowed_elements.add(input_el) |
635 | 647 |
|
636 | | - return super().sanitize_token(token) |
| 648 | + sanitized = super().sanitize_token(token) |
| 649 | + self._track_link_target(sanitized) |
| 650 | + return sanitized |
| 651 | + |
| 652 | + def _track_link_target(self, token): |
| 653 | + if token is None: |
| 654 | + return |
| 655 | + |
| 656 | + tag_type = token.get('type') |
| 657 | + tag_name = token.get('name') |
| 658 | + |
| 659 | + if tag_type == 'StartTag' and tag_name == 'a': |
| 660 | + href = token.get('data', {}).get((None, 'href')) |
| 661 | + hostname = hostname_from_url(href) |
| 662 | + self._current_link = { |
| 663 | + 'href': href, |
| 664 | + 'hostname': hostname, |
| 665 | + 'text_parts': [], |
| 666 | + } |
| 667 | + return |
| 668 | + |
| 669 | + if self._current_link is None: |
| 670 | + return |
| 671 | + |
| 672 | + if tag_type in ('Characters', 'SpaceCharacters'): |
| 673 | + self._current_link['text_parts'].append(token.get('data', '')) |
| 674 | + return |
| 675 | + |
| 676 | + if tag_type == 'EndTag' and tag_name == 'a': |
| 677 | + self._finalize_link_target() |
| 678 | + self._current_link = None |
| 679 | + |
| 680 | + def _finalize_link_target(self): |
| 681 | + href = self._current_link['href'] |
| 682 | + hostname = self._current_link['hostname'] |
| 683 | + if not href or not hostname: |
| 684 | + return |
| 685 | + |
| 686 | + visible_text = ''.join(self._current_link['text_parts']).strip() |
| 687 | + if not visible_text: |
| 688 | + return |
| 689 | + |
| 690 | + idn_lookalike = 'xn--' in hostname and idn_uses_lookalike_chars(hostname) |
| 691 | + if not idn_lookalike: |
| 692 | + # IDN with ASCII-lookalike chars (e.g. Cyrillic/Greek homographs) always |
| 693 | + # gets the suffix so the deception is visible; all other links only get it |
| 694 | + # when the visible text looks like a URL pointing somewhere different. |
| 695 | + if not text_contains_any_url(visible_text): |
| 696 | + return |
| 697 | + if text_contains_hostname(visible_text, hostname): |
| 698 | + return |
| 699 | + |
| 700 | + self._pending_link_suffix = { |
| 701 | + 'type': 'Characters', |
| 702 | + 'data': f' ({hostname})', |
| 703 | + } |
| 704 | + |
| 705 | + |
# First word of the Unicode character name for scripts considered visually distinct
# from ASCII, and therefore unlikely to be used in homograph attacks.  A non-ASCII
# character whose Unicode name does NOT begin with one of these words is treated as
# a potential ASCII lookalike.
NON_MISLEADING_IDN_SCRIPTS = frozenset({
    # East Asian
    'CJK', 'HANGUL', 'HIRAGANA', 'KATAKANA',
    # Semitic
    'ARABIC', 'HEBREW',
    # Southeast/Central Asian
    'THAI', 'LAO', 'KHMER', 'MYANMAR', 'TIBETAN',
    # South Asian (Indic)
    'DEVANAGARI', 'BENGALI', 'GUJARATI', 'GURMUKHI', 'ORIYA',
    'KANNADA', 'MALAYALAM', 'SINHALA', 'TAMIL', 'TELUGU',
    # Others
    'MONGOLIAN', 'ETHIOPIC', 'GEORGIAN',
})
| 717 | + |
| 718 | + |
def idn_uses_lookalike_chars(ascii_hostname: str) -> bool:
    """
    Tell whether an IDN hostname contains a non-ASCII character that might be
    mistaken for an ASCII one (e.g. Cyrillic or Greek lookalikes).

    The hostname is decoded with ``idna.decode``; a hostname that cannot be
    decoded is reported as misleading.  Each non-ASCII character is classified
    by the first word of its Unicode name, and anything not listed in
    ``NON_MISLEADING_IDN_SCRIPTS`` (including unnamed characters) counts as a
    lookalike.
    """
    try:
        decoded = idna.decode(ascii_hostname)
    except (idna.IDNAError, UnicodeError):
        return True  # can't decode → assume misleading

    def _may_mimic_ascii(ch):
        name_words = unicodedata.name(ch, '').split()
        leading_word = name_words[0] if name_words else ''
        return leading_word not in NON_MISLEADING_IDN_SCRIPTS

    return any(_may_mimic_ascii(ch) for ch in decoded if ord(ch) > 127)
| 736 | + |
| 737 | + |
def normalize_hostname(hostname: str) -> str:
    """
    Return ``hostname`` in a canonical lower-case ASCII form.

    Surrounding whitespace and trailing dots are removed, and internationalized
    names are converted to punycode with ``idna.encode(..., uts46=True)``.  An
    empty/``None`` input (or one that is empty after cleanup) yields ``''``.  If
    IDNA encoding fails, the cleaned-up lower-case hostname is returned as is.
    """
    cleaned = (hostname or '').strip().rstrip('.').lower()
    if not cleaned:
        return ''

    try:
        encoded = idna.encode(cleaned, uts46=True)
    except idna.IDNAError:
        return cleaned
    return encoded.decode('ascii').lower()
| 751 | + |
| 752 | + |
def hostname_from_url(url: str) -> str:
    """
    Return the normalized hostname of ``url``, or ``''`` when there is none.

    ``''`` is returned for an empty/``None`` url and for a url that ``urlparse``
    rejects with ``ValueError`` (for example an unbalanced ``[`` in the host
    part), so malformed hrefs in user content cannot abort rendering.
    Otherwise the parsed hostname is passed through ``normalize_hostname``.
    """
    if not url:
        return ''
    # Browsers treat ///foo.com as //foo.com (protocol-relative); urlparse does not,
    # so collapse three or more leading slashes to two before parsing.
    url = re.sub(r'^/{3,}', '//', url)
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # e.g. "//[foo" -> "Invalid IPv6 URL"; treat as having no usable hostname.
        return ''
    return normalize_hostname(hostname)
| 760 | + |
| 761 | + |
_HOST_TOKEN_RE = re.compile(r'[^\s<>()]+')  # a run of non-whitespace, non <>() characters

# Punctuation that commonly wraps a URL in running text; stripped from both ends of a token.
_TOKEN_WRAPPING_CHARS = '\'"[]{}()<>,;!?'


def _hostname_candidates(text: str):
    """
    Yield ``(candidate, has_scheme, hostname)`` for each token of ``text`` that
    ``urlparse`` can parse as a URL (when it contains ``://``) or as a bare host.

    Tokens that look like e-mail addresses (an ``@`` without ``://``) are skipped,
    as are tokens ``urlparse`` rejects with ``ValueError`` (e.g. an unbalanced
    ``[``), so odd link text cannot abort the caller.  ``hostname`` may be ``None``.
    """
    for token in _HOST_TOKEN_RE.findall(text):
        candidate = token.strip(_TOKEN_WRAPPING_CHARS)
        if not candidate:
            continue
        has_scheme = '://' in candidate
        if '@' in candidate and not has_scheme:
            continue
        try:
            hostname = urlparse(candidate if has_scheme else '//' + candidate).hostname
        except ValueError:
            continue
        yield candidate, has_scheme, hostname


def text_contains_any_url(text: str) -> bool:
    """
    Return True if the visible text appears to contain a URL or domain name.

    A token counts when it has a ``://`` scheme and a hostname, or when it is a
    bare token with a hostname, at least one ASCII letter and a dot that is not
    merely leading/trailing punctuation (so "Read more." is not a domain, while
    "example.com." still is).
    """
    for candidate, has_scheme, hostname in _hostname_candidates(text):
        if not hostname:
            continue
        if has_scheme:
            return True
        # Ignore sentence punctuation: a dot only counts when it separates labels.
        bare = candidate.strip('.')
        if '.' in bare and re.search(r'[a-zA-Z]', bare):
            return True
    return False


def text_contains_hostname(text: str, hostname: str) -> bool:
    """
    Return True if ``text`` visibly names ``hostname``.

    ``hostname`` is expected in normalized form (as returned by
    ``normalize_hostname``).  It matches either literally (case-insensitively)
    when it is not embedded in a longer host name, or through a URL-like token
    whose normalized hostname equals it.  An empty ``hostname`` is trivially
    contained.
    """
    if not hostname:
        return True

    # Literal match, but only on host-name boundaries: "ypal.com" must not match
    # inside "paypal.com", nor "example.com" inside "example.com.au".  A preceding
    # dot is allowed so that "www.example.com" still names "example.com".
    literal = r'(?<![\w-])' + re.escape(hostname.casefold()) + r'(?![\w-])(?!\.\w)'
    if re.search(literal, text.casefold()):
        return True

    for _candidate, _has_scheme, found in _hostname_candidates(text):
        if found and normalize_hostname(found) == hostname:
            return True

    return False
637 | 799 |
|
638 | 800 |
|
639 | 801 | def ip_address(request): |
|
0 commit comments