
Commit 436a6b4

Use simpler tokenizing regex for license scan #655
* Somehow some JS blurb was getting stuck in some dark corner of the current regexes. These were too complex anyway. The new ones are simpler and faster and do not have this bad behavior.
* Add new tests at the tokenize and query level.

Reported-by: Jarnu Girdhar <[email protected]>
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 5995505 commit 436a6b4


7 files changed: +127 -34 lines changed


src/licensedcode/tokenize.py

Lines changed: 10 additions & 13 deletions
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
 # http://nexb.com and https://github.com/nexB/scancode-toolkit/
 # The ScanCode software is licensed under the Apache License version 2.0.
 # Data generated with ScanCode require an acknowledgment.
@@ -64,13 +64,11 @@ def query_lines(location=None, query_string=None, strip=True):
         yield line
 
 
-# Split on whitespace and punctuations: keep only characters and +.
-# Keeping the + is important for licenses name such as GPL2+.
-
-_letter_or_digit = '[a-zA-Z0-9]+ ?\+'
-_not_punctuation = '[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
-query_pattern = _letter_or_digit + '|' + _not_punctuation
-word_splitter = re.compile('(?:%s)+' % query_pattern, re.UNICODE).findall
+# Split on whitespace and punctuations: keep only characters
+# and + in the middle or end of a word.
+# Keeping the trailing + is important for licenses name such as GPL2+
+query_pattern = '[^\W_]+\+?[^\W_]*'
+word_splitter = re.compile(query_pattern, re.UNICODE).findall
 
 def query_tokenizer(text, lower=True):
     """
@@ -83,9 +81,10 @@ def query_tokenizer(text, lower=True):
 
 
 # Alternate pattern used for matched text collection
+not_query_pattern = '[\W_+]+[\W_]?'
+
 # collect tokens and non-token texts in two different groups
-_punctuation = '[!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
-_text_capture_pattern = '(?P<token>(?:' + query_pattern + ')+)' + '|' + '(?P<punct>' + _punctuation + '+)'
+_text_capture_pattern = '(?P<token>' + query_pattern + ')' + '|' + '(?P<punct>' + not_query_pattern + ')'
 tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer
 
 def matched_query_text_tokenizer(text):
@@ -114,11 +113,9 @@ def matched_query_text_tokenizer(text):
 # {{something}} for templates. curly barces are otherwise treated as punctuation.
 # A template part is anything enclosed in double braces
 template_pattern = '\{\{[^{}]*\}\}'
-rule_pattern = '(?:%s)+|%s+' % (query_pattern, template_pattern,)
-# rule_pattern = template_pattern
+rule_pattern = '%s|%s+' % (query_pattern, template_pattern,)
 template_splitter = re.compile(rule_pattern , re.UNICODE).findall
 
-
 def rule_tokenizer(text, lower=True):
     """
     Return an iterable of tokens from a unicode rule text, skipping templated
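As an aside (not part of the commit): a minimal, standalone sketch of how the simplified patterns above behave, using only the standard library re module. The patterns are the ones added in this diff; the sample text is made up for illustration.

import re

# Same query_pattern as above, written as a raw string: one or more word
# characters (underscore excluded), an optional '+', then more word
# characters, so a trailing '+' as in "GPL2+" survives tokenization.
query_pattern = r'[^\W_]+\+?[^\W_]*'
word_splitter = re.compile(query_pattern, re.UNICODE).findall

# The alternate pattern pair capturing tokens and the punctuation between
# them, mirroring the token/punct groups built above for
# matched_query_text_tokenizer.
not_query_pattern = r'[\W_+]+[\W_]?'
_text_capture_pattern = '(?P<token>' + query_pattern + ')|(?P<punct>' + not_query_pattern + ')'
tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer

text = u'The GNU GPL2+ or, at your option, any later version.'
print([t.lower() for t in word_splitter(text)])
# ['the', 'gnu', 'gpl2+', 'or', 'at', 'your', 'option', 'any', 'later', 'version']

# Every character falls into exactly one of the two groups, which is what
# lets matched query text be reassembled verbatim.
print([(m.lastgroup, m.group()) for m in tokens_and_non_tokens(text)][:4])
# [('token', 'The'), ('punct', ' '), ('token', 'GNU'), ('punct', ' ')]

Note how gpl2+ keeps its trailing +, which matters for rules naming licenses such as GPL2+.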
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+--- src_one 1970-01-01 01:00:00.000000000 +0100
++++ src_one 2009-03-06 16:24:43.000000000 +0100
+@@ -0,0 +1,155 @@
++/*
++ * Copyright (C) 2009 Mycompany Inc. All rights reserved.
++ *
+
+
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Library General Public
++ * License as published by the Free Software Foundation; either
++ * version 2 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Library General Public License for more details.
++ *
++ * You should have received a copy of the GNU Library General Public License
++ * along with this library; see the file COPYING.LIB. If not, write to
++ * the Free Software Foundation,., 51 Franklin Street, Fifth Floor,
++ * Boston, MA 02110-1301, USA.
++ */
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Library General Public
++ * License as published by the Free Software Foundation; either
++ * version 2 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Library General Public License for more details.
++ *
++ * You should have received a copy of the GNU Library General Public License
++ * along with this library; see the file COPYING.LIB. If not, write to
++ * the Free Software Foundation., 51 Franklin Street, Fifth Floor,
++ * Boston, MA 02110-1301, USA.
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+licenses:
+    - lgpl-2.0-plus

tests/licensedcode/data/tokenize/parser.js

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

tests/licensedcode/test_query.py

Lines changed: 54 additions & 21 deletions
@@ -528,9 +528,7 @@ def test_query_run_has_correct_offset(self):
         result = [qr.to_dict() for qr in q.query_runs]
         expected = [
             {'end': 0, 'start': 0, 'tokens': u'inc'},
-            {
-             'end': 123,
-             'start': 1,
+            {'end': 123, 'start': 1,
              'tokens': (
                 u'this library is free software you can redistribute it and or modify '
                 u'it under the terms of the gnu library general public license as '
@@ -542,10 +540,43 @@ def test_query_run_has_correct_offset(self):
                 u'license for more details you should have received a copy of the gnu '
                 u'library general public license along with this library see the file '
                 u'copying lib if not write to the free software foundation inc 51 '
-                u'franklin street fifth floor boston ma 02110 1301 usa'
-            )
-        }]
+                u'franklin street fifth floor boston ma 02110 1301 usa')
+            }
+        ]
+        assert expected == result
+
+    def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self):
+        rule_dir = self.get_test_loc('query/run_breaking/rules')
+        rules = list(models.load_rules(rule_dir))
+        idx = index.LicenseIndex(rules)
+        query_doc = self.get_test_loc('query/run_breaking/query.txt')
+        q = Query(query_doc, idx=idx)
+        result = [qr.to_dict() for qr in q.query_runs]
+        expected = [
+            {'end': 121, 'start': 0,
+             'tokens':
+                'this library is free software you can redistribute it '
+                'and or modify it under the terms of the gnu library '
+                'general public license as published by the free software '
+                'foundation either version 2 of the license or at your '
+                'option any later version this library is distributed in '
+                'the hope that it will be useful but without any warranty '
+                'without even the implied warranty of merchantability or '
+                'fitness for a particular purpose see the gnu library '
+                'general public license for more details you should have '
+                'received a copy of the gnu library general public '
+                'license along with this library see the file copying lib '
+                'if not write to the free software foundation 51 franklin '
+                'street fifth floor boston ma 02110 1301 usa'}
+        ]
+
         assert expected == result
+        q.tokens
+        # check rules token are the same exact set as the set of the last query run
+        txtid = idx.tokens_by_tid
+        qrt = [txtid[t] for t in q.query_runs[-1].tokens]
+        irt = [txtid[t] for t in idx.tids_by_rid[0]]
+        assert irt == qrt
 
 
 class TestQueryWithFullIndex(FileBasedTesting):
@@ -590,11 +621,16 @@ def test_query_run_tokens(self):
         assert 1 == len(result.query_runs)
         qr = result.query_runs[0]
         # NOTE: this is not a token present in any rules or licenses
+        unknown_tokens = ('baridationally',)
+        assert unknown_tokens not in idx.dictionary
+        assert u' '.join([t for t in query_s.split() if t not in unknown_tokens]) == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
+
+    def test_query_run_tokens_matchable(self):
+        idx = cache.get_index()
+        # NOTE: this is not a token present in any rules or licenses
         unknown_token = u'baridationally'
         assert unknown_token not in idx.dictionary
-        assert u' '.join([t for t in query_s.split() if t not in (unknown_token, 'proc')]) == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
 
-    def test_query_run_tokens_matchable(self):
         query_s = u' '.join(u'''
 
             3 unable to create proc entry license gpl description driver author eric
@@ -607,27 +643,24 @@ def test_query_run_tokens_matchable(self):
             linux include asm include asm generic include acpi acpi c posix types 32 h
             types h types h h h h h
         '''.split())
-        idx = cache.get_index()
         result = Query(query_string=query_s, idx=idx)
-
         assert 1 == len(result.query_runs)
         qr = result.query_runs[0]
         expected_qr0 = u' '.join(u'''
-            3 unable to create entry license gpl description driver author eric depends 2
-            6 24 19 generic smp mod module acpi register driver acpi disabled acpi
-            install notify acpi get status cache caches create entry generate event acpi
-            evaluate object acpi remove notify remove entry acpi driver acpi acpi gcc gnu
-            4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack pointer current
-            stack pointer this module end usr src modules acpi include linux include asm
-            include asm generic include acpi acpi c posix types 32 h types h types h h h
-            h h
+            3 unable to create proc entry license gpl description driver author eric
+            depends 2 6 24 19 generic smp mod module acpi register driver
+            proc acpi disabled acpi install notify acpi get status cache
+            caches create proc entry generate proc event acpi evaluate
+            object acpi remove notify remove proc entry acpi driver acpi
+            acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack
+            pointer current stack pointer this module end usr src modules acpi include
+            linux include asm include asm generic include acpi acpi c posix types 32 h
+            types h types h h h h h
         '''.split())
         assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
 
-        # NOTE: this is not a token present in any rules or licenses
-        unknown_token = u'baridationally'
-        assert unknown_token not in idx.dictionary
         assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.matchables)
 
+        # only gpl is in high matchables
         expected = u'gpl'
         assert expected == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.high_matchables)

tests/licensedcode/test_tokenize.py

Lines changed: 22 additions & 0 deletions
@@ -28,6 +28,7 @@
 
 import codecs
 import itertools
+from time import time
 import os
 
 from commoncode.testcase import FileBasedTesting
@@ -368,6 +369,27 @@ def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templa
         text = u'}}{{{{abcd}}ddd}}{{'
         assert [u'ddd'] == list(rule_tokenizer(text))
 
+    def test_tokenizers_regex_do_not_choke_on_some_text(self):
+        # somehow this text was making the regex choke.
+        tf = self.get_test_loc('tokenize/parser.js')
+        with codecs.open(tf, 'rb', encoding='utf-8') as text:
+            content = text.read()
+
+        start = time()
+        list(rule_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
+        start = time()
+        list(query_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
+        start = time()
+        list(matched_query_text_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
 
 class TestNgrams(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR
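As an aside (not part of the commit): the timing assertions above guard against the pathological backtracking that the old nested-quantifier pattern could hit on minified JavaScript. Below is a rough, standalone illustration; the "old" pattern is an abbreviated reconstruction of the removed regex (its full punctuation class is trimmed for readability) and the input is synthetic rather than the parser.js test file, so exact timings will vary.

import re
from time import time

# Abbreviated reconstruction of the removed pattern: an alternation whose
# first branch requires a trailing '+', nested under an outer '+' repeat.
old_word_splitter = re.compile(r'(?:[a-zA-Z0-9]+ ?\+|[^\W\s])+', re.UNICODE).findall

# The pattern introduced by this commit: one pass, no nested quantifiers.
new_word_splitter = re.compile(r'[^\W_]+\+?[^\W_]*', re.UNICODE).findall

# On a long unbroken run of letters with no '+' (typical of minified JS),
# the old first branch re-scans and backtracks through the tail of the run
# at every step of the outer repeat, so its cost grows roughly
# quadratically with the run length; the new pattern scans the run once.
blob = u'a' * 5000
for name, splitter in (('old', old_word_splitter), ('new', new_word_splitter)):
    start = time()
    splitter(blob)
    print('%s: %.3f seconds' % (name, time() - start))

Both patterns produce the same single token for this input; only the time spent finding it differs.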
