Skip to content

Commit cbc2cee

Browse files
adrianeboyd and honnibal
authored and committed
Improve URL_PATTERN and handling in tokenizer (#4374)
* Move prefix and suffix detection for `URL_PATTERN` into the tokenizer.
  Remove the associated lookahead and lookbehind from `URL_PATTERN`.
  Fix tokenization for Hungarian given the new, modified handling of prefixes and suffixes.
* Match a wider range of URI schemes.
1 parent e65dffd commit cbc2cee

File tree

4 files changed

+15
-17
lines changed

4 files changed

+15
-17
lines changed

spacy/lang/hu/punctuation.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010

1111
_currency = r"\$¢£€¥฿"
1212
_quotes = CONCAT_QUOTES.replace("'", "")
13+
_units = UNITS.replace("%", "")
1314

1415
_prefixes = (
15-
[r"\+"]
16-
+ LIST_PUNCT
16+
LIST_PUNCT
1717
+ LIST_ELLIPSES
1818
+ LIST_QUOTES
1919
+ [_concat_icons]
@@ -29,7 +29,7 @@
2929
r"(?<=[0-9])\+",
3030
r"(?<=°[FfCcKk])\.",
3131
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
32-
r"(?<=[0-9])(?:{u})".format(u=UNITS),
32+
r"(?<=[0-9])(?:{u})".format(u=_units),
3333
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
3434
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
3535
),

spacy/lang/tokenizer_exceptions.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
# A few minor mods to this regex to account for use cases represented in test_urls
1111
URL_PATTERN = (
1212
r"^"
13-
# in order to support the prefix tokenization (see prefix test cases in test_urls).
14-
r"(?=[\w])"
15-
# protocol identifier
16-
r"(?:(?:https?|ftp|mailto)://)?"
17-
# user:pass authentication
13+
# protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
14+
r"(?:(?:[\w\+\-\.]{2,})://)?"
15+
# mailto:user or user:pass authentication
1816
r"(?:\S+(?::\S*)?@)?"
1917
r"(?:"
2018
# IP address exclusion
@@ -43,11 +41,7 @@
4341
# port number
4442
r"(?::\d{2,5})?"
4543
# resource path
46-
r"(?:/\S*)?"
47-
# query parameters
48-
r"\??(:?\S*)?"
49-
# in order to support the suffix tokenization (see suffix test cases in test_urls),
50-
r"(?<=[\w/])"
44+
r"(?:[/?#]\S*)?"
5145
r"$"
5246
).strip()
5347

spacy/tests/tokenizer/test_urls.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
URLS_FULL = URLS_BASIC + [
1414
"mailto:foo-bar@baz-co.com",
15+
"mailto:foo-bar@baz-co.com?subject=hi",
1516
"www.google.com?q=google",
1617
"http://foo.com/blah_(wikipedia)#cite-1",
1718
]
@@ -45,6 +46,10 @@
4546
"http://a.b-c.de",
4647
"http://223.255.255.254",
4748
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
49+
"ssh://login@server.com:12345/repository.git",
50+
"svn+ssh://user@ssh.yourdomain.com/path",
51+
pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
52+
pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
4853
pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
4954
pytest.param(
5055
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@
8186
"http:// shouldfail.com",
8287
":// should fail",
8388
"http://foo.bar/foo(bar)baz quux",
84-
"ftps://foo.bar/",
8589
"http://-error-.invalid/",
8690
"http://a.b-.co",
8791
"http://0.0.0.0",

spacy/tokenizer.pyx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,9 @@ cdef class Tokenizer:
227227
cdef unicode minus_suf
228228
cdef size_t last_size = 0
229229
while string and len(string) != last_size:
230-
if self.token_match and self.token_match(string):
230+
if self.token_match and self.token_match(string) \
231+
and not self.find_prefix(string) \
232+
and not self.find_suffix(string):
231233
break
232234
if self._specials.get(hash_string(string)) != NULL:
233235
has_special[0] = 1
@@ -243,8 +245,6 @@ cdef class Tokenizer:
243245
prefixes.push_back(self.vocab.get(mem, prefix))
244246
has_special[0] = 1
245247
break
246-
if self.token_match and self.token_match(string):
247-
break
248248
suf_len = self.find_suffix(string)
249249
if suf_len != 0:
250250
suffix = string[-suf_len:]

0 commit comments

Comments (0)