Skip to content

Commit 4820507

Browse files
author
Pietro Albini
committed
Fix syntax detection screwed up by wrong URLs (fixes #28)
Before this commit, if a URL contained two underscores, the whole message was detected as Markdown. This caused issues because Telegram clients refuse to render links as links if they contain Markdown syntax. This commit strips URLs and email addresses from messages before checking whether they are Markdown, which fixes the bug. It also adds new tests to prevent this issue from recurring.
1 parent 68c3226 commit 4820507

File tree

4 files changed

+33
-2
lines changed

4 files changed

+33
-2
lines changed

botogram/syntaxes.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
import re
1010

11+
from . import utils
12+
1113

1214
_markdown_re = re.compile(r".*("
1315
r"\*(.*)\*|"
@@ -30,11 +32,16 @@
3032

3133
def is_markdown(message):
3234
"""Check if a string is actually markdown"""
35+
# Don't mark part of URLs or email addresses as Markdown
36+
message = utils.strip_urls(message)
37+
3338
return bool(_markdown_re.match(message))
3439

3540

3641
def is_html(message):
3742
"""Check if a string is actually HTML"""
43+
# Here URLs are not stripped because no sane URL contains HTML tags in it,
44+
# and for those few cases the speed penalty is not worth it
3845
return bool(_html_re.match(message))
3946

4047

botogram/utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@
1717
import logbook
1818
import functools
1919

20+
# URLs regex created by http://twitter.com/imme_emosol
21+
2022
_username_re = re.compile(r"\@([a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)")
2123
_command_re = re.compile(r"^\/[a-zA-Z0-9_]+(\@[a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)?$")
2224
_email_re = re.compile(r"[a-zA-Z0-9_\.\+\-]+\@[a-zA-Z0-9_\.\-]+\.[a-zA-Z]+")
25+
_url_re = re.compile(r"https?://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?")
2326

2427
# This small piece of global state will track if logbook was configured
2528
_logger_configured = False
@@ -139,15 +142,22 @@ def docstring_of(func, bot=None, component_id=None):
139142
return format_docstr(docstring)
140143

141144

145+
def strip_urls(string):
146+
"""Strip URLs and emails from a string"""
147+
string = _url_re.sub("", string)
148+
string = _email_re.sub("", string)
149+
return string
150+
151+
142152
def usernames_in(message):
143153
"""Return all the matched usernames in the message"""
144154
# Don't parse usernames in the commands
145155
if _command_re.match(message.split(" ", 1)[0]):
146156
message = message.split(" ", 1)[1]
147157

148158
# Strip email addresses from the message, in order to avoid matching the
149-
# user's domain. This also happens to match username/passwords in URLs
150-
message = _email_re.sub("", message)
159+
# user's domain. Also strip URLs, in order to avoid usernames in them.
160+
message = strip_urls(message)
151161

152162
results = []
153163
for result in _username_re.finditer(message):

docs/changelog.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,17 @@ botogram 0.2
1818

1919
No changes yet.
2020

21+
.. _changelog-0.1.2:
22+
23+
botogram 0.1.2
24+
==============
25+
26+
*Bugfix release, not yet released*
27+
28+
* Fix automatic syntax detector recognizing markdown in URLs (`issue 28`_)
29+
30+
.. _issue 28: https://github.com/pietroalbini/botogram/issues/28
31+
2132
.. _changelog-0.1.1:
2233

2334
botogram 0.1.1

tests/test_syntaxes.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def test_is_markdown():
2121
assert botogram.syntaxes.is_markdown("[a](b)")
2222
assert botogram.syntaxes.is_markdown("![a](b)!")
2323

24+
assert not botogram.syntaxes.is_markdown("hey@this_is_awesome.com")
25+
assert not botogram.syntaxes.is_markdown("https://www.this_is_awesome.com")
26+
2427

2528
def test_is_html():
2629
assert not botogram.syntaxes.is_html("not HTML, sorry!")

0 commit comments

Comments
 (0)