
Commit b002083

Correct license text collection bug
The license text was truncated in some cases because Unicode diacritics behave slightly differently in regular expressions when lowercased or not. Certain lowercased diacritics were therefore no longer the same for the regular query tokenizer and the matched-text tokenizer, leading to an off-by-one error and a truncation of the collected license text. This fix changes how the check is done to verify whether a token is a known token or not. This also introduces a "highlight" argument to the LicenseMatch.matched_text() method and related functions.

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent a470767 commit b002083
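
For context, the root cause in one minimal Python sketch (illustration only, not code from this commit): lowercasing the Turkish dotted capital I adds a combining mark, so the lowercased raw token no longer matches what a lowercasing tokenizer produces.

    s = 'İ'            # U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
    low = s.lower()    # 'i' followed by U+0307 COMBINING DOT ABOVE
    assert len(s) == 1
    assert len(low) == 2    # lowercasing grew the string by one code point
    assert low != 'i'       # so a lookup keyed on a plain 'i' token misses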

5 files changed: +119 lines added, −35 lines removed

src/licensedcode/match.py

Lines changed: 65 additions & 24 deletions
@@ -37,6 +37,7 @@
 from licensedcode.spans import Span
 from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import matched_query_text_tokenizer
+from licensedcode.tokenize import query_tokenizer


 """
@@ -532,7 +533,7 @@ def itokens_hash(self, idx):

     # FIXME: this should be done for all the matches found in a given scanned
     # location at once to avoid reprocessing many times the original text
-    def matched_text(self, whole_lines=False,
+    def matched_text(self, whole_lines=False, highlight=True,
                      highlight_matched=u'%s', highlight_not_matched=u'[%s]',
                      _usecache=True):
         """
@@ -558,6 +559,7 @@ def matched_text(self, whole_lines=False,
             query_string=query.query_string,
             idx=query.idx,
             whole_lines=whole_lines,
+            highlight=highlight,
             highlight_matched=highlight_matched,
             highlight_not_matched=highlight_not_matched, _usecache=_usecache)
         ).rstrip()
@@ -1384,15 +1386,16 @@ def _log(_matches, _discarded, msg):
 @attr.s(slots=True, frozen=True)
 class Token(object):
     """
-    Used to represent a token in collected matched texts and SPDX identifiers.
+    Used to represent a token in collected query-side matched texts and SPDX
+    identifiers.
     """
     # original text value for this token.
     value = attr.ib()
     # line number, one-based
     line_num = attr.ib()
     # absolute position for known tokens, zero-based. -1 for unknown tokens
     pos = attr.ib(default=-1)
-    # False if this is punctuation
+    # False if this is punctuation or spaces
    is_text = attr.ib(default=False)
     # True if part of a match
     is_matched = attr.ib(default=False)
@@ -1405,6 +1408,8 @@ def tokenize_matched_text(location, query_string, dictionary, _cache={}):
     Return a list of Token objects with pos and line number collected from the
     file at `location` or the `query_string` string. `dictionary` is the index
     mapping of tokens to token ids.
+
+    NOTE: the _cache={} arg IS A GLOBAL by design.
     """
     key = location, query_string
     cached = _cache.get(key)
@@ -1425,26 +1430,52 @@ def _tokenize_matched_text(location, query_string, dictionary):
     """
     pos = -1
     for line_num, line in query.query_lines(location, query_string, strip=False):
+        if TRACE_MATCHED_TEXT_DETAILS:
+            logger_debug('  _tokenize_matched_text:',
+                'line_num:', line_num,
+                'line:', line)
+
         for is_text, token_str in matched_query_text_tokenizer(line):
-            known = token_str.lower() in dictionary
+            if TRACE_MATCHED_TEXT_DETAILS:
+                logger_debug('    is_text:', is_text, 'token_str:', repr(token_str))
+            known = False
+            if token_str and token_str.strip():
+                # we re-tokenize using the query tokenizer
+                tokenized = list(query_tokenizer(token_str))
+                if tokenized:
+                    assert len(tokenized) == 1, repr((is_text, token_str, tokenized))
+                    tokenized = tokenized[0]
+                    known = tokenized in dictionary
+
             if known:
                 pos += 1
                 p = pos
             else:
                 p = -1
-            yield Token(
+
+            tok = Token(
                 value=token_str,
                 line_num=line_num,
                 is_text=is_text,
                 is_known=known,
                 pos=p)

+            if TRACE_MATCHED_TEXT_DETAILS:
+                logger_debug('    token:', tok)
+            yield tok
+

 def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=False):
     """
-    Yield Tokens from an iterable of `tokens` that are inside a `match_qspan`
-    matched Span starting at `start_line` and ending at `end_line`. Known
-    matched tokens are tagged as is_matched=True.
+    Yield Tokens from a `tokens` iterable of Token objects (built from a query-
+    side scanned file or string) that are inside a `match_qspan` matched Span
+    starting at `start_line` and ending at `end_line`. If whole_lines is True,
+    also yield unmatched Tokens that are before and after the match and on the
+    first and last line of a match (unless the lines are very long text lines or
+    the match is from binary content.)
+
+    As a side effect, known matched tokens are tagged as is_matched=True if they
+    are matched.

     If `whole_lines` is True, any token within matched lines range is included.
     Otherwise, a token is included if its position is within the matched
@@ -1475,7 +1506,12 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
                 tok = attr.evolve(tok, is_matched=True)
                 is_included = True
                 if TRACE_MATCHED_TEXT_DETAILS:
-                    logger_debug('   tok.is_matched = True')
+                    logger_debug('   tok.is_matched = True', 'match_qspan:', match_qspan)
+            else:
+                if TRACE_MATCHED_TEXT_DETAILS:
+                    logger_debug('   unmatched token: tok.is_matched = False',
+                        'match_qspan:', match_qspan,
+                        'tok.pos in match_qspan:', tok.pos in match_qspan)

             if whole_lines:
                 # we only work on matched lines so no need to test further
@@ -1527,7 +1563,7 @@ def reportable_tokens(tokens, match_qspan, start_line, end_line, whole_lines=Fal
 def get_full_matched_text(
     match, location=None, query_string=None, idx=None,
     whole_lines=False,
-    highlight_matched=u'%s', highlight_not_matched=u'[%s]',
+    highlight=True, highlight_matched=u'%s', highlight_not_matched=u'[%s]',
     stopwords=STOPWORDS, _usecache=True):
     """
     Yield unicode strings corresponding to the full matched query text
@@ -1541,11 +1577,11 @@ def get_full_matched_text(
     matched line and the end of the last matched lines are also included in the
     returned text.

-    Each token is interpolated for "highlighting" and emphasis with the
-    `highlight_matched` format string for matched tokens or to the
-    `highlight_not_matched` for tokens not matched. The default is to enclose an
-    unmatched token sequence in [] square brackets. Punctuation is not
-    highlighted.
+    If `highlight` is True, each token is formatted for "highlighting" and
+    emphasis with the `highlight_matched` format string for matched tokens or to
+    the `highlight_not_matched` for tokens not matched. The default is to
+    enclose an unmatched token sequence in [] square brackets. Punctuation is
+    not highlighted.
     """
     if TRACE_MATCHED_TEXT:
         logger_debug('get_full_matched_text: match:', match)
@@ -1564,7 +1600,7 @@ def get_full_matched_text(
         tokens = list(tokens)
         logger_debug('get_full_matched_text: tokens:')
         for t in tokens:
-            print(t)
+            print('  ', t)

     tokens = reportable_tokens(
         tokens, match.qspan, match.start_line, match.end_line, whole_lines=whole_lines)
@@ -1576,16 +1612,21 @@ def get_full_matched_text(
             print(t)

     if TRACE_MATCHED_TEXT:
-        logger_debug('get_full_matched_text: highlight_matched:', highlight_matched, 'highlight_not_matched:', highlight_not_matched)
+        logger_debug(
+            'get_full_matched_text: highlight_matched:', highlight_matched,
+            'highlight_not_matched:', highlight_not_matched)

     # Finally yield strings with eventual highlightings
     for token in tokens:
         val = token.value
-        if token.is_text and val.lower() not in stopwords:
-            if token.is_matched:
-                yield highlight_matched % val
-            else:
-                yield highlight_not_matched % val
-        else:
-            # we do not highlight punctuation..
+        if not highlight:
             yield val
+        else:
+            if token.is_text and val.lower() not in stopwords:
+                if token.is_matched:
+                    yield highlight_matched % val
+                else:
+                    yield highlight_not_matched % val
+            else:
+                # we do not highlight punctuation..
+                yield val
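
Condensed, the old and new known-token checks compare as follows. This is a self-contained sketch: the query_tokenizer stand-in and the tiny dictionary below are simplified assumptions for illustration, not the scancode implementations.

    import re
    import unicodedata

    def query_tokenizer(text):
        # stand-in (assumption) for licensedcode.tokenize.query_tokenizer:
        # lowercase, drop combining marks, split into word tokens
        text = unicodedata.normalize('NFD', text.lower())
        text = ''.join(c for c in text if not unicodedata.combining(c))
        return re.findall(r'\w+', text)

    dictionary = {'i', 'license', 'mit'}  # hypothetical index dictionary

    token_str = 'İ'
    # old check: 'İ'.lower() is 'i' + U+0307, which is not a dictionary
    # key, so the token is wrongly treated as unknown and positions drift
    assert (token_str.lower() in dictionary) is False

    # new check: re-tokenize with the same query tokenizer so both sides
    # normalize identically before the dictionary lookup
    tokenized = list(query_tokenizer(token_str))
    assert len(tokenized) == 1 and tokenized[0] in dictionary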

src/licensedcode/tokenize.py

Lines changed: 8 additions & 3 deletions
@@ -137,7 +137,7 @@ def matched_query_text_tokenizer(text):
     - True if the string is a text token or False if this is not
       (such as punctuation, spaces, etc).
     - the corresponding string.
-    This is used to reconstruct the matched query text accurately.
+    This is used to reconstruct the matched query text for reporting.
     """
     if not text:
         return
@@ -146,8 +146,13 @@ def matched_query_text_tokenizer(text):
         mgd = match.groupdict()
         token = mgd.get('token')
         punct = mgd.get('punct')
-        if token or punct:
-            yield (True, token) if token else (False, punct)
+        if token:
+            yield True, token
+        elif punct:
+            yield False, punct
+        else:
+            # this should never happen
+            raise Exception('Internal error in matched_query_text_tokenizer')


 def ngrams(iterable, ngram_length):
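
The tokenizer's contract, per the docstring above: every piece of the input comes back as an (is_text, string) pair, and joining the pairs reconstructs the text verbatim. A hypothetical example of that shape (illustration only, not actual tokenizer output):

    pairs = [(True, 'license'), (False, ' '), (True, 'MIT')]
    # text tokens are flagged True; spaces/punctuation False; the pieces
    # concatenate back to the original line so matched text can be reported
    assert ''.join(s for _, s in pairs) == 'license MIT'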

src/scancode/api.py

Lines changed: 2 additions & 7 deletions
@@ -194,16 +194,11 @@ def get_licenses(location, min_score=0,

     for match in matches:
         matched_text = None
-        # TODO: handle whole lines with the case of very long lines
         if include_text:
             if license_text_diagnostics:
-                matched_text = match.matched_text(whole_lines=False)
+                matched_text = match.matched_text(whole_lines=False, highlight=True)
             else:
-                highlight_not_matched = highlight_matched = u'%s'
-                matched_text = match.matched_text(
-                    highlight_matched=highlight_matched,
-                    highlight_not_matched=highlight_not_matched,
-                    whole_lines=True)
+                matched_text = match.matched_text(whole_lines=True, highlight=False)

         detected_expressions.append(match.rule.license_expression)

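The effect of the two call modes above, sketched with a tiny stand-in renderer (the render helper and token pairs are illustrative assumptions, not scancode code):

    highlight_matched = u'%s'
    highlight_not_matched = u'[%s]'

    def render(tokens, highlight=True):
        # tokens: (value, is_matched) pairs for the pieces of a line
        for val, is_matched in tokens:
            if not highlight:
                yield val
            elif is_matched:
                yield highlight_matched % val
            else:
                yield highlight_not_matched % val

    tokens = [('İ', False), (' ', True), ('license', True), (' ', True), ('MIT', True)]
    # diagnostics mode keeps the [] markers on unmatched tokens
    assert ''.join(render(tokens)) == '[İ] license MIT'
    # plain whole-lines mode returns the text without markers
    assert ''.join(render(tokens, highlight=False)) == 'İ license MIT'
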
tests/licensedcode/data/match/unicode_text/main3.js

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+İ license MIT

tests/licensedcode/test_match.py

Lines changed: 43 additions & 1 deletion
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #
 # Copyright (c) 2017 nexB Inc. and others. All rights reserved.
 # http://nexb.com and https://github.com/nexB/scancode-toolkit/
@@ -911,9 +912,19 @@ def test_get_full_matched_text_base(self):
 THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
 IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """
-        matched_text = u''.join(get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False))
+        matched_text = u''.join(
+            get_full_matched_text(match, query_string=querys, idx=idx, _usecache=False))
         assert expected == matched_text

+        expected_nh = u"""Copyright 2003 (C) James. All Rights Reserved.
+THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
+IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """
+        matched_text_nh = u''.join(
+            get_full_matched_text(
+                match, query_string=querys, idx=idx, _usecache=False, highlight=False))
+        assert expected_nh == matched_text_nh
+
         expected_origin_text = u"""Copyright 2003 (C) James. All Rights Reserved.
 THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
 IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
@@ -1173,3 +1184,34 @@ def test_matched_text_is_collected_correctly_end2end_for_spdx_match(self):
         results = [match.matched_text(_usecache=False) for match in idx.match(location=query_location)]
         expected = ['BSD-2-Clause-Patent']
         assert expected == results
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query(self):
+        idx = cache.get_index()
+        querys_with_diacritic_unicode = 'İ license MIT'
+        result = idx.match(query_string=querys_with_diacritic_unicode)
+        assert 1 == len(result)
+        match = result[0]
+        expected = 'license MIT'
+        matched_text = match.matched_text(_usecache=False)
+        assert expected == matched_text
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_file(self):
+        idx = cache.get_index()
+        file_with_diacritic_unicode_location = self.get_test_loc('match/unicode_text/main3.js')
+        result = idx.match(location=file_with_diacritic_unicode_location)
+        assert 1 == len(result)
+        match = result[0]
+        expected = 'license MIT'
+        matched_text = match.matched_text(_usecache=False)
+        assert expected == matched_text
+
+    def test_matched_text_is_not_truncated_with_unicode_diacritic_input_from_query_whole_lines(self):
+        idx = cache.get_index()
+        querys_with_diacritic_unicode = 'İ license MIT'
+        result = idx.match(query_string=querys_with_diacritic_unicode)
+        assert 1 == len(result)
+        match = result[0]
+        expected = '[İ] license MIT'
+        matched_text = match.matched_text(_usecache=False, whole_lines=True)
+        assert expected == matched_text
