
Commit 436a6b4

Use simpler tokenizing regex for license scan #655
* Somehow some JS blurb was getting stuck in some dark corner of the current regexes. These were too complex anyway. The new ones are simpler and faster and do not have this bad behavior.
* Add new tests at the tokenize and query level.

Reported-by: Jarnu Girdhar <[email protected]>
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 5995505 commit 436a6b4


7 files changed: +127 -34 lines changed


src/licensedcode/tokenize.py

Lines changed: 10 additions & 13 deletions
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
+# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
 # http://nexb.com and https://github.com/nexB/scancode-toolkit/
 # The ScanCode software is licensed under the Apache License version 2.0.
 # Data generated with ScanCode require an acknowledgment.
@@ -64,13 +64,11 @@ def query_lines(location=None, query_string=None, strip=True):
         yield line
 
 
-# Split on whitespace and punctuations: keep only characters and +.
-# Keeping the + is important for licenses name such as GPL2+.
-
-_letter_or_digit = '[a-zA-Z0-9]+ ?\+'
-_not_punctuation = '[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
-query_pattern = _letter_or_digit + '|' + _not_punctuation
-word_splitter = re.compile('(?:%s)+' % query_pattern, re.UNICODE).findall
+# Split on whitespace and punctuations: keep only characters
+# and + in the middle or end of a word.
+# Keeping the trailing + is important for licenses name such as GPL2+
+query_pattern = '[^\W_]+\+?[^\W_]*'
+word_splitter = re.compile(query_pattern, re.UNICODE).findall
 
 def query_tokenizer(text, lower=True):
     """
@@ -83,9 +81,10 @@ def query_tokenizer(text, lower=True):
 
 
 # Alternate pattern used for matched text collection
+not_query_pattern = '[\W_+]+[\W_]?'
+
 # collect tokens and non-token texts in two different groups
-_punctuation = '[!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
-_text_capture_pattern = '(?P<token>(?:' + query_pattern + ')+)' + '|' + '(?P<punct>' + _punctuation + '+)'
+_text_capture_pattern = '(?P<token>' + query_pattern + ')' + '|' + '(?P<punct>' + not_query_pattern + ')'
 tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer
 
 def matched_query_text_tokenizer(text):
@@ -114,11 +113,9 @@ def matched_query_text_tokenizer(text):
 # {{something}} for templates. curly barces are otherwise treated as punctuation.
 # A template part is anything enclosed in double braces
 template_pattern = '\{\{[^{}]*\}\}'
-rule_pattern = '(?:%s)+|%s+' % (query_pattern, template_pattern,)
-# rule_pattern = template_pattern
+rule_pattern = '%s|%s+' % (query_pattern, template_pattern,)
 template_splitter = re.compile(rule_pattern , re.UNICODE).findall
 
-
 def rule_tokenizer(text, lower=True):
     """
     Return an iterable of tokens from a unicode rule text, skipping templated
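As an aside (not part of the commit): a minimal, standalone sketch of how the simplified patterns above behave, using only the standard library re module. The patterns are the ones added in this diff; the sample text is made up for illustration.

import re

# Same query_pattern as above, written as a raw string: one or more word
# characters (underscore excluded), an optional '+', then more word
# characters, so a trailing '+' as in "GPL2+" survives tokenization.
query_pattern = r'[^\W_]+\+?[^\W_]*'
word_splitter = re.compile(query_pattern, re.UNICODE).findall

# The alternate pattern pair capturing tokens and the punctuation between
# them, mirroring the token/punct groups built above for
# matched_query_text_tokenizer.
not_query_pattern = r'[\W_+]+[\W_]?'
_text_capture_pattern = '(?P<token>' + query_pattern + ')|(?P<punct>' + not_query_pattern + ')'
tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer

text = u'The GNU GPL2+ or, at your option, any later version.'
print([t.lower() for t in word_splitter(text)])
# ['the', 'gnu', 'gpl2+', 'or', 'at', 'your', 'option', 'any', 'later', 'version']

# Every character falls into exactly one of the two groups, which is what
# lets matched query text be reassembled verbatim.
print([(m.lastgroup, m.group()) for m in tokens_and_non_tokens(text)][:4])
# [('token', 'The'), ('punct', ' '), ('token', 'GNU'), ('punct', ' ')]

Note how gpl2+ keeps its trailing +, which matters for rules naming licenses such as GPL2+.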
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+--- src_one 1970-01-01 01:00:00.000000000 +0100
++++ src_one 2009-03-06 16:24:43.000000000 +0100
+@@ -0,0 +1,155 @@
++/*
++ * Copyright (C) 2009 Mycompany Inc. All rights reserved.
++ *
+
+
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Library General Public
++ * License as published by the Free Software Foundation; either
++ * version 2 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Library General Public License for more details.
++ *
++ * You should have received a copy of the GNU Library General Public License
++ * along with this library; see the file COPYING.LIB. If not, write to
++ * the Free Software Foundation,., 51 Franklin Street, Fifth Floor,
++ * Boston, MA 02110-1301, USA.
++ */
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Library General Public
++ * License as published by the Free Software Foundation; either
++ * version 2 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Library General Public License for more details.
++ *
++ * You should have received a copy of the GNU Library General Public License
++ * along with this library; see the file COPYING.LIB. If not, write to
++ * the Free Software Foundation., 51 Franklin Street, Fifth Floor,
++ * Boston, MA 02110-1301, USA.
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+licenses:
+    - lgpl-2.0-plus

tests/licensedcode/data/tokenize/parser.js

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

tests/licensedcode/test_query.py

Lines changed: 54 additions & 21 deletions
@@ -528,9 +528,7 @@ def test_query_run_has_correct_offset(self):
         result = [qr.to_dict() for qr in q.query_runs]
         expected = [
             {'end': 0, 'start': 0, 'tokens': u'inc'},
-            {
-             'end': 123,
-             'start': 1,
+            {'end': 123, 'start': 1,
              'tokens': (
                 u'this library is free software you can redistribute it and or modify '
                 u'it under the terms of the gnu library general public license as '
@@ -542,10 +540,43 @@ def test_query_run_has_correct_offset(self):
                 u'license for more details you should have received a copy of the gnu '
                 u'library general public license along with this library see the file '
                 u'copying lib if not write to the free software foundation inc 51 '
-                u'franklin street fifth floor boston ma 02110 1301 usa'
-            )
-        }]
+                u'franklin street fifth floor boston ma 02110 1301 usa')
+            }
+        ]
+        assert expected == result
+
+    def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self):
+        rule_dir = self.get_test_loc('query/run_breaking/rules')
+        rules = list(models.load_rules(rule_dir))
+        idx = index.LicenseIndex(rules)
+        query_doc = self.get_test_loc('query/run_breaking/query.txt')
+        q = Query(query_doc, idx=idx)
+        result = [qr.to_dict() for qr in q.query_runs]
+        expected = [
+            {'end': 121, 'start': 0,
+             'tokens':
+                'this library is free software you can redistribute it '
+                'and or modify it under the terms of the gnu library '
+                'general public license as published by the free software '
+                'foundation either version 2 of the license or at your '
+                'option any later version this library is distributed in '
+                'the hope that it will be useful but without any warranty '
+                'without even the implied warranty of merchantability or '
+                'fitness for a particular purpose see the gnu library '
+                'general public license for more details you should have '
+                'received a copy of the gnu library general public '
+                'license along with this library see the file copying lib '
+                'if not write to the free software foundation 51 franklin '
+                'street fifth floor boston ma 02110 1301 usa'}
+        ]
+
         assert expected == result
+        q.tokens
+        # check rules token are the same exact set as the set of the last query run
+        txtid = idx.tokens_by_tid
+        qrt = [txtid[t] for t in q.query_runs[-1].tokens]
+        irt = [txtid[t] for t in idx.tids_by_rid[0]]
+        assert irt == qrt
 
 
 class TestQueryWithFullIndex(FileBasedTesting):
@@ -590,11 +621,16 @@ def test_query_run_tokens(self):
         assert 1 == len(result.query_runs)
         qr = result.query_runs[0]
         # NOTE: this is not a token present in any rules or licenses
+        unknown_tokens = ('baridationally',)
+        assert unknown_tokens not in idx.dictionary
+        assert u' '.join([t for t in query_s.split() if t not in unknown_tokens]) == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
+
+    def test_query_run_tokens_matchable(self):
+        idx = cache.get_index()
+        # NOTE: this is not a token present in any rules or licenses
         unknown_token = u'baridationally'
         assert unknown_token not in idx.dictionary
-        assert u' '.join([t for t in query_s.split() if t not in (unknown_token, 'proc')]) == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
 
-    def test_query_run_tokens_matchable(self):
         query_s = u' '.join(u'''
 
             3 unable to create proc entry license gpl description driver author eric
@@ -607,27 +643,24 @@ def test_query_run_tokens_matchable(self):
             linux include asm include asm generic include acpi acpi c posix types 32 h
             types h types h h h h h
         '''.split())
-        idx = cache.get_index()
         result = Query(query_string=query_s, idx=idx)
-
         assert 1 == len(result.query_runs)
         qr = result.query_runs[0]
         expected_qr0 = u' '.join(u'''
-            3 unable to create entry license gpl description driver author eric depends 2
-            6 24 19 generic smp mod module acpi register driver acpi disabled acpi
-            install notify acpi get status cache caches create entry generate event acpi
-            evaluate object acpi remove notify remove entry acpi driver acpi acpi gcc gnu
-            4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack pointer current
-            stack pointer this module end usr src modules acpi include linux include asm
-            include asm generic include acpi acpi c posix types 32 h types h types h h h
-            h h
+            3 unable to create proc entry license gpl description driver author eric
+            depends 2 6 24 19 generic smp mod module acpi register driver
+            proc acpi disabled acpi install notify acpi get status cache
+            caches create proc entry generate proc event acpi evaluate
+            object acpi remove notify remove proc entry acpi driver acpi
+            acpi gcc gnu 4 2 3 ubuntu 4 2 3 gcc gnu 4 2 3 ubuntu 4 2 3 current stack
+            pointer current stack pointer this module end usr src modules acpi include
+            linux include asm include asm generic include acpi acpi c posix types 32 h
+            types h types h h h h h
         '''.split())
         assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for t in qr.tokens)
 
-        # NOTE: this is not a token present in any rules or licenses
-        unknown_token = u'baridationally'
-        assert unknown_token not in idx.dictionary
         assert expected_qr0 == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.matchables)
 
+        # only gpl is in high matchables
         expected = u'gpl'
         assert expected == u' '.join(idx.tokens_by_tid[t] for p, t in enumerate(qr.tokens) if p in qr.high_matchables)

tests/licensedcode/test_tokenize.py

Lines changed: 22 additions & 0 deletions
@@ -28,6 +28,7 @@
 
 import codecs
 import itertools
+from time import time
 import os
 
 from commoncode.testcase import FileBasedTesting
@@ -368,6 +369,27 @@ def test_rule_tokenizer_handles_combination_of_well_formed_and_ill_formed_templa
         text = u'}}{{{{abcd}}ddd}}{{'
         assert [u'ddd'] == list(rule_tokenizer(text))
 
+    def test_tokenizers_regex_do_not_choke_on_some_text(self):
+        # somehow this text was making the regex choke.
+        tf = self.get_test_loc('tokenize/parser.js')
+        with codecs.open(tf, 'rb', encoding='utf-8') as text:
+            content = text.read()
+
+        start = time()
+        list(rule_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
+        start = time()
+        list(query_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
+        start = time()
+        list(matched_query_text_tokenizer(content))
+        duration = time() - start
+        assert duration < 5
+
 
 class TestNgrams(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR
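As an aside (not part of the commit): the timing assertions above guard against the pathological backtracking that the old nested-quantifier pattern could hit on minified JavaScript. Below is a rough, standalone illustration; the "old" pattern is an abbreviated reconstruction of the removed regex (its full punctuation class is trimmed for readability) and the input is synthetic rather than the parser.js test file, so exact timings will vary.

import re
from time import time

# Abbreviated reconstruction of the removed pattern: an alternation whose
# first branch requires a trailing '+', nested under an outer '+' repeat.
old_word_splitter = re.compile(r'(?:[a-zA-Z0-9]+ ?\+|[^\W\s])+', re.UNICODE).findall

# The pattern introduced by this commit: one pass, no nested quantifiers.
new_word_splitter = re.compile(r'[^\W_]+\+?[^\W_]*', re.UNICODE).findall

# On a long unbroken run of letters with no '+' (typical of minified JS),
# the old first branch re-scans and backtracks through the tail of the run
# at every step of the outer repeat, so its cost grows roughly
# quadratically with the run length; the new pattern scans the run once.
blob = u'a' * 5000
for name, splitter in (('old', old_word_splitter), ('new', new_word_splitter)):
    start = time()
    splitter(blob)
    print('%s: %.3f seconds' % (name, time() - start))

Both patterns produce the same single token for this input; only the time spent finding it differs.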
