Skip to content

Commit b37dc23

Browse files
committed
Attempt to resync handle_starttag with MessageEntity.cpp (see #107).
1 parent ea55f1c commit b37dc23

File tree

2 files changed

+95
-14
lines changed

2 files changed

+95
-14
lines changed

metabot/util/html.py

Lines changed: 61 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -34,27 +34,75 @@ def sanitize(self, text, strip=False, length=None):
3434
'tg-emoji': 'emoji-id',
3535
}
3636

37-
def handle_starttag(self, tag, attrs):
37+
def handle_starttag(self, tag, attrs): # pylint: disable=too-many-branches,too-many-return-statements
3838
if self.__remaining is not None and not self.__remaining:
3939
return
4040
if tag in ('br', 'div', 'p'):
4141
self.__append('\n')
4242
return
43-
if self.__strip or tag not in self.naked and tag not in self.coupled:
43+
if self.__strip:
4444
return
45+
if tag in self.naked:
46+
self.__stack.append(tag)
47+
self.__pieces.append(f'<{tag}>')
48+
return
49+
if tag not in self.coupled:
50+
return
51+
52+
attrs = dict(attrs)
53+
attribute_name = self.coupled[tag]
54+
attribute_value = attrs.get(attribute_name, '')
55+
56+
# pylint: disable=line-too-long
57+
# if (tag_name == "a" && attribute_name == Slice("href")) {
58+
# argument = std::move(attribute_value);
59+
# } else if (tag_name == "code" && attribute_name == Slice("class") &&
60+
# begins_with(attribute_value, "language-")) {
61+
# argument = attribute_value.substr(9);
62+
# } else if (tag_name == "span" && attribute_name == Slice("class") && begins_with(attribute_value, "tg-")) {
63+
# argument = attribute_value.substr(3);
64+
# } else if (tag_name == "tg-emoji" && attribute_name == Slice("emoji-id")) {
65+
# argument = std::move(attribute_value);
66+
# } else if (tag_name == "blockquote" && attribute_name == Slice("expandable")) {
67+
# argument = "1";
68+
# }
69+
argument = None
70+
if tag == 'a':
71+
argument = attribute_value
72+
elif tag == 'code' and attribute_value.startswith('language-'):
73+
argument = attribute_value[9:]
74+
elif tag == 'span' and attribute_value.startswith('tg-'):
75+
argument = attribute_value[3:]
76+
elif tag == 'tg-emoji':
77+
argument = attribute_value
78+
elif tag == 'blockquote' and 'expandable' in attrs:
79+
argument = '1'
80+
81+
# if (tag_name == "span" && argument != "spoiler") {
82+
# return Status::Error(400, PSLICE()
83+
# << "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
84+
# }
85+
if tag == 'span' and argument != 'spoiler':
86+
return
87+
88+
# } else if (tag_name == "tg-emoji") {
89+
# auto r_document_id = to_integer_safe<int64>(nested_entities.back().argument);
90+
# if (r_document_id.is_error() || r_document_id.ok() == 0) {
91+
# return Status::Error(400, "Invalid custom emoji identifier specified");
92+
# }
93+
if tag == 'tg-emoji' and (not argument.isdigit() or not 0 < int(argument) < 2**63):
94+
return
95+
96+
if tag == 'a' and not argument:
97+
return
98+
4599
self.__stack.append(tag)
46-
if tag in self.coupled:
47-
attr = self.coupled[tag]
48-
attrs = dict(attrs)
49-
if attrs.get(attr):
50-
value = escape(attrs[attr])
51-
self.__pieces.append(f'<{tag} {attr}="{value}">')
52-
elif attr in attrs:
53-
self.__pieces.append(f'<{tag} {attr}>')
54-
else:
55-
self.__pieces.append(f'<{tag}>')
56-
else:
100+
if not argument:
57101
self.__pieces.append(f'<{tag}>')
102+
elif not attribute_value:
103+
self.__pieces.append(f'<{tag} {attribute_name}>')
104+
else:
105+
self.__pieces.append(f'<{tag} {attribute_name}="{escape(attribute_value)}">')
58106

59107
def handle_endtag(self, tag):
60108
while self.__stack:

metabot/util/test_html.py

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def test_sanitize():
3030
"""
3131
assert html.sanitize(text) == text
3232
assert html.sanitize('a<br>b<div>c</div><p>d</p>') == 'a\nb\nc\nd'
33-
assert html.sanitize('<a weird="true">text</a>') == '<a>text</a>'
33+
assert html.sanitize('<code weird="true">text</code>') == '<code>text</code>'
3434

3535
assert html.sanitize('<b>good</b> <bogus>bad</bogus>') == '<b>good</b> bad'
3636
assert html.sanitize('<b><bogus>nested</bogus></b>') == '<b>nested</b>'
@@ -116,3 +116,36 @@ def test_truncate():
116116
# but this would truncate to '&lt;2345<b>67</b>' because it counted '&lt;' as 4 instead of 1
117117
# while processing '&lt;2345':
118118
assert html.truncate('&lt;2345<b>678901234567890', 10) == '&lt;2345<b>67890</b>'
119+
120+
121+
def test_MessageEntity_cpp_quirks(): # pylint: disable=invalid-name
122+
"""Verify sync with https://github.com/tdlib/td/blob/master/td/telegram/MessageEntity.cpp."""
123+
124+
assert html.sanitize('<a foo=1 href=2 bar=3>b</a>') == '<a href="2">b</a>'
125+
# Technically, '<a>https://example.com/</a>' would be considered valid, but I don't want to
126+
# have to validate URLs.
127+
assert html.sanitize('<a foo=1 bar=3>b</a>') == 'b'
128+
129+
assert html.sanitize('<SPAN CLASS="tg-spoiler">x</span>') == '<span class="tg-spoiler">x</span>'
130+
assert html.sanitize('<span class="other">x</span>') == 'x'
131+
assert html.sanitize('<span>x</span>') == 'x'
132+
133+
assert html.sanitize('<blockquote>quote</blockquote>') == '<blockquote>quote</blockquote>'
134+
assert html.sanitize(
135+
'<blockquote other=1>quote</blockquote>') == '<blockquote>quote</blockquote>'
136+
assert html.sanitize(
137+
'<blockquote expandable>quote</blockquote>') == '<blockquote expandable>quote</blockquote>'
138+
assert html.sanitize('<blockquote expandable=true>quote</blockquote>'
139+
) == '<blockquote expandable="true">quote</blockquote>'
140+
141+
assert html.sanitize('<tg-emoji>emoji</tg-emoji>') == 'emoji'
142+
assert html.sanitize(
143+
'<tg-emoji emoji-id=1>emoji</tg-emoji>') == '<tg-emoji emoji-id="1">emoji</tg-emoji>'
144+
assert html.sanitize('<tg-emoji emoji-id=0>emoji</tg-emoji>') == 'emoji'
145+
assert html.sanitize('<tg-emoji emoji-id=10000000000000000000>emoji</tg-emoji>') == 'emoji'
146+
assert html.sanitize('<tg-emoji emoji-id=dummy>emoji</tg-emoji>') == 'emoji'
147+
148+
assert html.sanitize('<code>code</code>') == '<code>code</code>'
149+
assert html.sanitize(
150+
'<code class="language-python">code</code>') == '<code class="language-python">code</code>'
151+
assert html.sanitize('<code class="other">code</code>') == '<code>code</code>'

0 commit comments

Comments
 (0)