
Commit b002083

Correct license text collection bug
The license text was truncated in some cases because Unicode diacritics behave slightly differently in regular expressions when lowercased or not. Certain lowercased diacritics were therefore no longer the same for the regular query tokenizer and the matched-text tokenizer, leading to an off-by-one error and a truncation of the collected license text. This fix changes how the check is done to verify whether a token is a known token or not. This also introduces a "highlight" argument to the LicenseMatch.matched_text() method and related functions.

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent a470767 commit b002083
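
For context, the root cause in one minimal Python sketch (illustration only, not code from this commit): lowercasing the Turkish dotted capital I adds a combining mark, so the lowercased raw token no longer matches what a lowercasing tokenizer produces.

    s = 'İ'            # U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
    low = s.lower()    # 'i' followed by U+0307 COMBINING DOT ABOVE
    assert len(s) == 1
    assert len(low) == 2    # lowercasing grew the string by one code point
    assert low != 'i'       # so a lookup keyed on a plain 'i' token misses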

5 files changed: +119 lines added, −35 lines removed

src/licensedcode/match.py

Lines changed: 65 additions & 24 deletions
@@ -37,6 +37,7 @@
 from licensedcode.spans import Span
 from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import matched_query_text_tokenizer
+from licensedcode.tokenize import query_tokenizer


 """
@@ -532,7 +533,7 @@ def itokens_hash(self, idx):

     # FIXME: this should be done for all the matches found in a given scanned
     # location at once to avoid reprocessing many times the original text
-    def matched_text(self, whole_lines=False,
+    def matched_text(self, whole_lines=False, highlight=True,
                      highlight_matched=u'%s', highlight_not_matched=u'[%s]',
                      _usecache=True):
         """
@@ -558,6 +559,7 @@ def matched_text(self, whole_lines=False,
             query_string=query.query_string,
             idx=query.idx,
             whole_lines=whole_lines,
+            highlight=highlight,
             highlight_matched=highlight_matched,
             highlight_not_matched=highlight_not_matched, _usecache=_usecache)
         ).rstrip()
@@ -1384,15 +1386,16 @@ def _log(_matches, _discarded, msg):
 @attr.s(slots=True, frozen=True)
 class Token(object):
     """
-    Used to represent a token in collected matched texts and SPDX identifiers.
+    Used to represent a token in collected query-side matched texts and SPDX
+    identifiers.
     """
     # original text value for this token.
     value = attr.ib()
     # line number, one-based
     line_num = attr.ib()
     # absolute position for known tokens, zero-based. -1 for unknown tokens
     pos = attr.ib(default=-1)
-    # False if this is punctuation
+    # False if this is punctuation or spaces
    is_text = attr.ib(default=False)
     # True if part of a match
     is_matched = attr.ib(default=False)
@@ -1405,6 +1408,8 @@ def tokenize_matched_text(location, query_string, dictionary, _cache={}):
     Return a list of Token objects with pos and line number collected from the
     file at `location` or the `query_string` string. `dictionary` is the index
     mapping of tokens to token ids.
+
+    NOTE: the _cache={} arg IS A GLOBAL by design.
     """
     key = location, query_string
     cached = _cache.get(key)
@@ -1425,26 +1430,52 @@ def _tokenize_matched_text(location, query_string, dictionary):
     """
     pos = -1
     for line_num, line in query.query_lines(location, query_string, strip=False):
+        if TRACE_MATCHED_TEXT_DETAILS:
+            logger_debug('  _tokenize_matched_text:',
+                'line_num:', line_num,
+                'line:', line)
+
         for is_text, token_str in matched_query_text_tokenizer(line):
-            known = token_str.lower() in dictionary
+            if TRACE_MATCHED_TEXT_DETAILS:
+                logger_debug('    is_text:', is_text, 'token_str:', repr(token_str))
+            known = False
+            if token_str and token_str.strip():
+                # we re-tokenize using the query tokenizer
+                tokenized = list(query_tokenizer(token_str))
+                if tokenized:
+                    assert len(tokenized) == 1, repr((is_text, token_str, tokenized))
+                    tokenized = tokenized[0]
+                    known = tokenized in dictionary
+
             if known:
                 pos += 1
                 p = pos
             else:
                 p = -1
-            yield Token(
+
+            tok = Token(
                 value=token_str,
                 line_num=line_num,
                 is_text=is_text,
                 is_known=known,
                 pos=p)

+            if TRACE_MATCHED_TEXT_DETAILS:
+                logger_debug('    token:', tok)
+            yield tok
+

 def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=False):
     """
-    Yield Tokens from an iterable of `tokens` that are inside a `match_qspan`
-    matched Span starting at `start_line` and ending at `end_line`. Known
-    matched tokens are tagged as is_matched=True.
+    Yield Tokens from a `tokens` iterable of Token objects (built from a query-
+    side scanned file or string) that are inside a `match_qspan` matched Span
+    starting at `start_line` and ending at `end_line`. If whole_lines is True,
+    also yield unmatched Tokens that are before and after the match and on the
+    first and last line of a match (unless the lines are very long text lines or
+    the match is from binary content.)
+
+    As a side effect, known matched tokens are tagged as is_matched=True if they
+    are matched.

     If `whole_lines` is True, any token within matched lines range is included.
     Otherwise, a token is included if its position is within the matched
@@ -1475,7 +1506,12 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
                 tok = attr.evolve(tok, is_matched=True)
                 is_included = True
                 if TRACE_MATCHED_TEXT_DETAILS:
-                    logger_debug('   tok.is_matched = True')
+                    logger_debug('   tok.is_matched = True', 'match_qspan:', match_qspan)
+            else:
+                if TRACE_MATCHED_TEXT_DETAILS:
+                    logger_debug('   unmatched token: tok.is_matched = False',
+                        'match_qspan:', match_qspan,
+                        'tok.pos in match_qspan:', tok.pos in match_qspan)

             if whole_lines:
                 # we only work on matched lines so no need to test further
@@ -1527,7 +1563,7 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
 def get_full_matched_text(
     match, location=None, query_string=None, idx=None,
     whole_lines=False,
-    highlight_matched=u'%s', highlight_not_matched=u'[%s]',
+    highlight=True, highlight_matched=u'%s', highlight_not_matched=u'[%s]',
     stopwords=STOPWORDS, _usecache=True):
     """
     Yield unicode strings corresponding to the full matched query text
@@ -1541,11 +1577,11 @@ def get_full_matched_text(
     matched line and the end of the last matched lines are also included in the
     returned text.

-    Each token is interpolated for "highlighting" and emphasis with the
-    `highlight_matched` format string for matched tokens or to the
-    `highlight_not_matched` for tokens not matched. The default is to enclose an
-    unmatched token sequence in [] square brackets. Punctuation is not
-    highlighted.
+    If `highlight` is True, each token is formatted for "highlighting" and
+    emphasis with the `highlight_matched` format string for matched tokens or to
+    the `highlight_not_matched` for tokens not matched. The default is to
+    enclose an unmatched token sequence in [] square brackets. Punctuation is
+    not highlighted.
     """
     if TRACE_MATCHED_TEXT:
         logger_debug('get_full_matched_text: match:', match)
@@ -1564,7 +1600,7 @@ def get_full_matched_text(
         tokens = list(tokens)
         logger_debug('get_full_matched_text: tokens:')
         for t in tokens:
-            print(t)
+            print('  ', t)

     tokens = reportable_tokens(
         tokens, match.qspan, match.start_line, match.end_line, whole_lines=whole_lines)
@@ -1576,16 +1612,21 @@ def get_full_matched_text(
             print(t)

     if TRACE_MATCHED_TEXT:
-        logger_debug('get_full_matched_text: highlight_matched:', highlight_matched, 'highlight_not_matched:', highlight_not_matched)
+        logger_debug(
+            'get_full_matched_text: highlight_matched:', highlight_matched,
+            'highlight_not_matched:', highlight_not_matched)

     # Finally yield strings with eventual highlightings
     for token in tokens:
         val = token.value
-        if token.is_text and val.lower() not in stopwords:
-            if token.is_matched:
-                yield highlight_matched % val
-            else:
-                yield highlight_not_matched % val
-        else:
-            # we do not highlight punctuation..
+        if not highlight:
             yield val
+        else:
+            if token.is_text and val.lower() not in stopwords:
+                if token.is_matched:
+                    yield highlight_matched % val
+                else:
+                    yield highlight_not_matched % val
+            else:
+                # we do not highlight punctuation..
+                yield val
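
Condensed, the old and new known-token checks compare as follows. This is a self-contained sketch: the query_tokenizer stand-in and the tiny dictionary below are simplified assumptions for illustration, not the scancode implementations.

    import re
    import unicodedata

    def query_tokenizer(text):
        # stand-in (assumption) for licensedcode.tokenize.query_tokenizer:
        # lowercase, drop combining marks, split into word tokens
        text = unicodedata.normalize('NFD', text.lower())
        text = ''.join(c for c in text if not unicodedata.combining(c))
        return re.findall(r'\w+', text)

    dictionary = {'i', 'license', 'mit'}  # hypothetical index dictionary

    token_str = 'İ'
    # old check: 'İ'.lower() is 'i' + U+0307, which is not a dictionary
    # key, so the token is wrongly treated as unknown and positions drift
    assert (token_str.lower() in dictionary) is False

    # new check: re-tokenize with the same query tokenizer so both sides
    # normalize identically before the dictionary lookup
    tokenized = list(query_tokenizer(token_str))
    assert len(tokenized) == 1 and tokenized[0] in dictionary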

src/licensedcode/tokenize.py

Lines changed: 8 additions & 3 deletions
@@ -137,7 +137,7 @@ def matched_query_text_tokenizer(text):
     - True if the string is a text token or False if this is not
       (such as punctuation, spaces, etc).
     - the corresponding string.
-    This is used to reconstruct the matched query text accurately.
+    This is used to reconstruct the matched query text for reporting.
     """
     if not text:
         return
@@ -146,8 +146,13 @@ def matched_query_text_tokenizer(text):
         mgd = match.groupdict()
         token = mgd.get('token')
         punct = mgd.get('punct')
-        if token or punct:
-            yield (True, token) if token else (False, punct)
+        if token:
+            yield True, token
+        elif punct:
+            yield False, punct
+        else:
+            # this should never happen
+            raise Exception('Internal error in matched_query_text_tokenizer')


 def ngrams(iterable, ngram_length):
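
The tokenizer's contract, per the docstring above: every piece of the input comes back as an (is_text, string) pair, and joining the pairs reconstructs the text verbatim. A hypothetical example of that shape (illustration only, not actual tokenizer output):

    pairs = [(True, 'license'), (False, ' '), (True, 'MIT')]
    # text tokens are flagged True; spaces/punctuation False; the pieces
    # concatenate back to the original line so matched text can be reported
    assert ''.join(s for _, s in pairs) == 'license MIT'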

src/scancode/api.py

Lines changed: 2 additions & 7 deletions
@@ -194,16 +194,11 @@ def get_licenses(location, min_score=0,

     for match in matches:
         matched_text = None
-        # TODO: handle whole lines with the case of very long lines
         if include_text:
             if license_text_diagnostics:
-                matched_text = match.matched_text(whole_lines=False)
+                matched_text = match.matched_text(whole_lines=False, highlight=True)
             else:
-                highlight_not_matched = highlight_matched = u'%s'
-                matched_text = match.matched_text(
-                    highlight_matched=highlight_matched,
-                    highlight_not_matched=highlight_not_matched,
-                    whole_lines=True)
+                matched_text = match.matched_text(whole_lines=True, highlight=False)

         detected_expressions.append(match.rule.license_expression)

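The effect of the two call modes above, sketched with a tiny stand-in renderer (the render helper and token pairs are illustrative assumptions, not scancode code):

    highlight_matched = u'%s'
    highlight_not_matched = u'[%s]'

    def render(tokens, highlight=True):
        # tokens: (value, is_matched) pairs for the pieces of a line
        for val, is_matched in tokens:
            if not highlight:
                yield val
            elif is_matched:
                yield highlight_matched % val
            else:
                yield highlight_not_matched % val

    tokens = [('İ', False), (' ', True), ('license', True), (' ', True), ('MIT', True)]
    # diagnostics mode keeps the [] markers on unmatched tokens
    assert ''.join(render(tokens)) == '[İ] license MIT'
    # plain whole-lines mode returns the text without markers
    assert ''.join(render(tokens, highlight=False)) == 'İ license MIT'
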
tests/licensedcode/data/match/unicode_text/main3.js

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+İ license MIT

tests/licensedcode/test_match.py

Lines changed: 43 additions & 1 deletion
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #
 # Copyright (c) 2017 nexB Inc. and others. All rights reserved.
 # http://nexb.com and https://github.com/nexB/scancode-toolkit/
@@ -911,9 +912,19 @@ def test_get_full_matched_text_base(self):
 THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
 IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """
-        matched_text = u''.join(get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False))
+        matched_text = u''.join(
+            get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False))
         assert expected == matched_text

+        expected_nh = u"""Copyright 2003 (C) James. All Rights Reserved.
+THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
+IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """
+        matched_text_nh = u''.join(
+            get_full_matched_text(
+                match, query_string=querys, idx=idx, _usecache=False, highlight=False))
+        assert expected_nh == matched_text_nh
+
         expected_origin_text = u"""Copyright 2003 (C) James. All Rights Reserved.
 THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
 IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
@@ -1173,3 +1184,34 @@ def test_matched_text_is_collected_correctly_end2end_for_spdx_match(self):
         results = [match.matched_text(_usecache=False) for match in idx.match(location=query_location)]
         expected = ['BSD-2-Clause-Patent']
         assert expected == results
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query(self):
+        idx = cache.get_index()
+        querys_with_diacritic_unicode = 'İ license MIT'
+        result = idx.match(query_string=querys_with_diacritic_unicode)
+        assert 1 == len(result)
+        match = result[0]
+        expected = 'license MIT'
+        matched_text = match.matched_text(_usecache=False)
+        assert expected == matched_text
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_file(self):
+        idx = cache.get_index()
+        file_with_diacritic_unicode_location = self.get_test_loc('match/unicode_text/main3.js')
+        result = idx.match(location=file_with_diacritic_unicode_location)
+        assert 1 == len(result)
+        match = result[0]
+        expected = 'license MIT'
+        matched_text = match.matched_text(_usecache=False)
+        assert expected == matched_text
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query_whole_lines(self):
+        idx = cache.get_index()
+        querys_with_diacritic_unicode = 'İ license MIT'
+        result = idx.match(query_string=querys_with_diacritic_unicode)
+        assert 1 == len(result)
+        match = result[0]
+        expected = '[İ] license MIT'
+        matched_text = match.matched_text(_usecache=False, whole_lines=True)
+        assert expected == matched_text
