Commit 43773a0

Merge pull request #423 from nexB/260-license-text-capture
#260 license text capture
2 parents 9aba333 + 037cbe3

File tree: 14 files changed, +429 −34 lines changed


setup.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ def read(*names, **kwargs):
         'bitarray >= 0.8.1, < 1.0.0',
         'intbitset >= 2.3.0, < 3.0',
         'pyahocorasick >= 1.1, < 1.2',
+        'attrs >=16.0, < 17.0',

         # caching
         'zc.lockfile >= 1.0.0, < 2.0.0',
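
The new attrs requirement supports the slotted Token class added to src/licensedcode/match.py below. A minimal sketch of the attr.s pattern relied on here (names illustrative, not part of the patch):

    import attr

    @attr.s(slots=True)
    class Token(object):
        # slots=True avoids a per-instance __dict__, which matters when
        # building one Token per token of a possibly large query text
        value = attr.ib()
        pos = attr.ib(default=-1)

    tok = Token(value=u'GPL')
    tok.pos = 7  # fields stay mutable; only the set of fields is fixed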

src/licensedcode/match.py

Lines changed: 118 additions & 0 deletions
@@ -37,6 +37,7 @@
 from licensedcode import query
 from licensedcode.spans import Span
 from licensedcode import MAX_DIST
+from licensedcode import tokenize

 """
 LicenseMatch data structure and matches merging and filtering routines.
@@ -1084,3 +1085,120 @@ def _debug_print_matched_query_text(match, query, extras=5):
     logger_debug(' MATCHED QUERY TEXT with extras')
     qt, _it = get_texts(new_match, location=query.location, query_string=None, idx=query.idx)
     print(qt)
+
+
+def get_full_matched_text(match, location=None, query_string=None, idx=None,
+                          whole_lines=False,
+                          highlight_matched=u'%s', highlight_not_matched=u'[%s]'):
+    """
+    Yield a stream of unicode strings corresponding to the full matched query
+    text given a query file at `location` or a `query_string`, a `match` and an
+    `idx` index. This contains the full text including punctuation and spaces
+    that do not participate in the match proper.
+
+    Each text token string is interpolated for optional highlighting with the
+    `highlight_matched` format string if matched or with the
+    `highlight_not_matched` format string if not matched.
+    Punctuation is not "highlighted".
+
+    Optionally if `whole_lines` is True, the unmatched parts at the start of
+    the first matched line and at the end of the last matched line are included
+    in the text.
+    """
+    assert idx
+    dictionary_get = idx.dictionary.get
+
+    import attr
+    @attr.s(slots=True)
+    class Token(object):
+        value = attr.ib()
+        line_num = attr.ib()
+        pos = attr.ib(default=-1)
+        is_text = attr.ib(default=False)
+        is_included = attr.ib(default=False)
+        is_matched = attr.ib(default=False)
+        is_known = attr.ib(default=False)
+
+    def _tokenize(location, query_string):
+        """Yield Tokens with pos and line number."""
+        _pos = -1
+        for _line_num, _line in enumerate(query.query_lines(location, query_string, strip=False), 1):
+            for _is_text, _token in tokenize.matched_query_text_tokenizer(_line):
+                _known = _is_text and dictionary_get(_token.lower()) is not None
+                _tok = Token(value=_token, line_num=_line_num, is_text=_is_text, is_known=_known)
+                if _known:
+                    _pos += 1
+                    _tok.pos = _pos
+                yield _tok
+
+    def _filter_unmatched_lines(tokens, _start_line, _end_line):
+        """Skip lines that are not matched."""
+        for token in tokens:
+            if token.line_num < _start_line:
+                continue
+            if token.line_num > _end_line:
+                break
+            yield token
+
+    def _tag_tokens_as_matched(tokens, qspan):
+        """Tag tokens that are matched with is_matched."""
+        for token in tokens:
+            if token.pos != -1 and token.is_known and token.pos in qspan:
+                token.is_matched = True
+            yield token
+
+    def _tag_tokens_as_included_in_whole_lines(tokens, _start_line, _end_line):
+        """Tag all tokens in lines as included."""
+        for token in tokens:
+            if _start_line <= token.line_num <= _end_line:
+                token.is_included = True
+            yield token
+
+    def _tag_tokens_as_included_in_matched_range(tokens, _start, _end):
+        """Tag tokens between start and end as included."""
+        started = False
+        finished = False
+        for token in tokens:
+            if not started and token.pos == _start:
+                started = True
+
+            if started and not finished:
+                token.is_included = True
+
+            yield token
+
+            if token.pos == _end:
+                finished = True
+
+    def _filter_non_included_tokens(tokens):
+        """Skip non-included tokens."""
+        for token in tokens:
+            if token.is_included:
+                yield token
+
+    # Create and process a stream of Tokens
+    tokenized = _tokenize(location, query_string)
+    in_line_range = _filter_unmatched_lines(tokenized, match.start_line, match.end_line)
+    matched = _tag_tokens_as_matched(in_line_range, match.qspan)
+    if whole_lines:
+        included = _tag_tokens_as_included_in_whole_lines(matched, match.start_line, match.end_line)
+    else:
+        included = _tag_tokens_as_included_in_matched_range(matched, match.qspan.start, match.qspan.end)
+    tokens = _filter_non_included_tokens(included)
+
+    # Finally yield strings with eventual highlighting
+    for token in tokens:
+        if token.is_text:
+            if token.is_matched:
+                yield highlight_matched % token.value
+            else:
+                yield highlight_not_matched % token.value
+        else:
+            # punctuation
+            yield token.value
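
A possible way to exercise the new helper, assuming an index built the usual way (the sample path is hypothetical):

    from licensedcode.index import get_index
    from licensedcode.match import get_full_matched_text

    idx = get_index()
    location = 'samples/NOTICE'  # hypothetical query file
    for match in idx.match(location=location):
        # with the defaults, matched tokens pass through unchanged and
        # unmatched tokens are bracketed (highlight_not_matched=u'[%s]')
        print(u''.join(get_full_matched_text(match, location=location, idx=idx)))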

src/licensedcode/tokenize.py

Lines changed: 43 additions & 8 deletions
@@ -40,28 +40,36 @@
 for queries and rules texts.
 """

-
-def query_lines(location=None, query_string=None):
+def query_lines(location=None, query_string=None, strip=True):
     """
     Return an iterable of text lines given a file at `location` or a
-    `query string`. Include empty lines.
+    `query string`. Include empty lines.
     """
     # TODO: OPTIMIZE: tokenizing line by line may be rather slow
     # we could instead get lines and tokens at once in a batch?
     lines = []
     if location:
         lines = text_lines(location, demarkup=False)
     elif query_string:
-        lines = query_string.splitlines(False)
+        if strip:
+            keepends = False
+        else:
+            keepends = True
+        lines = query_string.splitlines(keepends)

     for line in lines:
-        yield line.strip()
+        if strip:
+            yield line.strip()
+        else:
+            yield line


 # Split on whitespace and punctuations: keep only characters and +.
 # Keeping the + is important for licenses name such as GPL2+.

-query_pattern = '[a-zA-Z0-9]+ ?\+|[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+_letter_or_digit = '[a-zA-Z0-9]+ ?\+'
+_not_punctuation = '[^!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+query_pattern = _letter_or_digit + '|' + _not_punctuation
 word_splitter = re.compile('(?:%s)+' % query_pattern, re.UNICODE).findall

 def query_tokenizer(text, lower=True):
@@ -74,6 +82,33 @@ def query_tokenizer(text, lower=True):
     return (token for token in word_splitter(text) if token)


+# Alternate pattern used for matched text collection:
+# collect tokens and non-token texts in two different groups.
+_punctuation = '[!\"#\$%&\'\(\)\*,\-\./:;<=>\?@\[\]\^_`\{\|\}\\\~\s\+\x92\x93\x94”“’–]'
+_text_capture_pattern = '(?P<token>(?:' + query_pattern + ')+)' + '|' + '(?P<punct>' + _punctuation + '+)'
+tokens_and_non_tokens = re.compile(_text_capture_pattern, re.UNICODE).finditer
+
+def matched_query_text_tokenizer(text):
+    """
+    Return an iterable of tokens and non-tokens from a unicode query text,
+    keeping everything (including punctuation, line endings, etc.).
+    The returned iterable contains 2-tuples of:
+    - True if the string is a text token or False if it is not (such as
+      punctuation, spaces, etc.)
+    - the corresponding string.
+    This is used to reconstruct the matched query text accurately.
+    """
+    if not text:
+        return
+    for match in tokens_and_non_tokens(text):
+        if not match:
+            continue
+        mgd = match.groupdict()
+        token = mgd.get('token')
+        punct = mgd.get('punct')
+        if token or punct:
+            yield (True, token) if token else (False, punct)
+
+
 # Template-aware splitter, keeping a templated part {{anything}} as a token.
 # This splitter yields plain token strings or double braces-enclosed strings
 # {{something}} for templates. curly barces are otherwise treated as punctuation.
@@ -140,7 +175,7 @@ def ngrams(iterable, ngram_length):
     []

     This also works with arrays or tuples:
-
+
     >>> from array import array
     >>> list(ngrams(array(b'h', [1,2,3,4,5]), 2))
     [(1, 2), (2, 3), (3, 4), (4, 5)]
@@ -156,7 +191,7 @@ def select_ngrams(ngrams, with_pos=False):
     Return an iterable as a subset of a sequence of ngrams using the hailstorm
     algorithm. If `with_pos` is True also include the starting position for the ngram
     in the original sequence.
-
+
     Definition from the paper: http://www2009.eprints.org/7/1/p61.pdf
     The algorithm first fingerprints every token and then selects a shingle s if
     the minimum fingerprint value of all k tokens in s occurs at the first or the
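
To illustrate the new tokenizer, a quick sketch of the expected shape of its output (the exact tuples below are inferred from the pattern, not verified):

    from licensedcode.tokenize import matched_query_text_tokenizer

    for is_text, s in matched_query_text_tokenizer(u'GPL-2.0 or later!'):
        print((is_text, s))
    # text tokens are flagged True, punctuation runs False, e.g.:
    # (True, u'GPL'), (False, u'-'), (True, u'2'), (False, u'.'), (True, u'0'),
    # (False, u' '), (True, u'or'), (False, u' '), (True, u'later'), (False, u'!')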

src/scancode/api.py

Lines changed: 7 additions & 1 deletion
@@ -103,7 +103,7 @@ def get_urls(location):
 DEJACODE_LICENSE_URL = 'https://enterprise.dejacode.com/urn/urn:dje:license:{}'


-def get_licenses(location, min_score=0, diag=False):
+def get_licenses(location, min_score=0, include_text=False, diag=False):
     """
     Yield dictionaries of license data detected in the file at location.

@@ -115,12 +115,16 @@ def get_licenses(location, min_score=0, diag=False):
     key of the returned mapping.
     """
     from licensedcode.index import get_index
+    from licensedcode.match import get_full_matched_text
     from licensedcode.models import get_licenses as licenses_details

     idx = get_index()
     licenses = licenses_details()

     for match in idx.match(location=location, min_score=min_score):
+        if include_text:
+            matched_text = u''.join(get_full_matched_text(match, location=location,
+                                                          idx=idx, whole_lines=False))
         for license_key in match.rule.licenses:
             lic = licenses.get(license_key)
             result = OrderedDict()
@@ -146,6 +150,8 @@ def get_licenses(location, min_score=0, diag=False):
             matched_rule['matched_length'] = match.ilen()
             matched_rule['match_coverage'] = match.coverage()
             matched_rule['rule_relevance'] = match.rule.relevance
+            if include_text:
+                result['matched_text'] = matched_text
             yield result
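
With the new flag, API callers can request the reconstructed text per detection; a sketch (the scanned path is hypothetical):

    from scancode.api import get_licenses

    for detection in get_licenses('samples/zlib/zlib.h', include_text=True):
        # each detection mapping now carries a 'matched_text' entry
        print(detection.get('matched_text'))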

src/scancode/cli.py

Lines changed: 29 additions & 10 deletions
@@ -271,6 +271,8 @@ def validate_formats(ctx, param, value):
 @click.option('-i', '--info', is_flag=True, default=False, help='Include information such as size, type, etc.')
 @click.option('--license-score', is_flag=False, default=0, type=int, show_default=True,
               help='Do not return license matches with scores lower than this score. A number between 0 and 100.')
+@click.option('--license-text', is_flag=True, default=False,
+              help='Include the detected licenses matched text. Has no effect unless --license is requested.')

 @click.option('-f', '--format', is_flag=False, default='json', show_default=True, metavar='<style>',
               help=('Set <output_file> format <style> to one of the standard formats: %s '
@@ -289,14 +291,14 @@ def validate_formats(ctx, param, value):
 @click.option('--max-memory', is_flag=False, default=DEFAULT_MAX_MEMORY, type=int, show_default=True, help='Stop scanning a file if scanning requires more than a maximum amount of memory in megabytes.')

 def scancode(ctx, input, output_file, copyright, license, package,
-             email, url, info, license_score, format,
+             email, url, info, license_score, license_text, format,
              verbose, quiet, processes,
              diag, timeout, max_memory,
              *args, **kwargs):
     """scan the <input> file or directory for origin clues and license and save results to the <output_file>.

     The scan results are printed to stdout if <output_file> is not provided.
-    Error and progress is printed to stderr.
+    Error and progress is printed to stderr.
     """
     possible_scans = [copyright, license, package, email, url, info]
     # Default scan when no options is provided
@@ -307,9 +309,22 @@ def scancode(ctx, input, output_file, copyright, license, package,

     scans_cache_class = get_scans_cache_class()
     try:
-        files_count, results = scan(input, copyright, license, package, email, url, info, license_score,
-                                    verbose, quiet, processes, scans_cache_class,
-                                    diag, timeout, max_memory)
+        files_count, results = scan(input_path=input,
+                                    copyright=copyright,
+                                    license=license,
+                                    package=package,
+                                    email=email,
+                                    url=url,
+                                    info=info,
+                                    license_score=license_score,
+                                    license_text=license_text,
+                                    verbose=verbose,
+                                    quiet=quiet,
+                                    processes=processes,
+                                    timeout=timeout, max_memory=max_memory,
+                                    diag=diag,
+                                    scans_cache_class=scans_cache_class,
+                                    )
         if not quiet:
             echo_stderr('Saving results.', fg='green')
             save_results(files_count, results, format, input, output_file)
@@ -323,10 +338,14 @@ def scancode(ctx, input, output_file, copyright, license, package,
     # ctx.exit(rc)


-def scan(input_path, copyright=True, license=True, package=True,
-         email=False, url=False, info=True, license_score=0,
-         verbose=False, quiet=False, processes=1, scans_cache_class=None,
-         diag=False, timeout=DEFAULT_TIMEOUT, max_memory=DEFAULT_MAX_MEMORY):
+def scan(input_path,
+         copyright=True, license=True, package=True,
+         email=False, url=False, info=True,
+         license_score=0, license_text=False,
+         verbose=False, quiet=False,
+         processes=1, timeout=DEFAULT_TIMEOUT, max_memory=DEFAULT_MAX_MEMORY,
+         diag=False,
+         scans_cache_class=None):
     """
     Return a tuple of (file_count, indexing_time, scan_results) where
     scan_results is an iterable. Run each requested scan proper: each individual file
@@ -337,7 +356,7 @@ def scan(input_path, copyright=True, license=True, package=True,
     scan_summary = OrderedDict()
     scan_summary['scanned_path'] = input_path
     scan_summary['processes'] = processes
-    get_licenses_with_score = partial(get_licenses, min_score=license_score, diag=diag)
+    get_licenses_with_score = partial(get_licenses, min_score=license_score, include_text=license_text, diag=diag)

     # note: "flag and function" expressions return the function if flag is True
     # note: the order of the scans matters to show things in logical order
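
The option can be exercised end to end by driving the Click command in-process; a sketch with illustrative paths:

    from click.testing import CliRunner
    from scancode.cli import scancode

    runner = CliRunner()
    # --license-text has no effect unless --license is also requested
    result = runner.invoke(scancode,
                           ['--license', '--license-text', 'samples/', 'results.json'])
    print(result.exit_code)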

src/textcode/analysis.py

Lines changed: 4 additions & 3 deletions
@@ -160,9 +160,10 @@ def as_unicode(line):
     return s


-def remove_verbatim_line_endings(s):
+def remove_verbatim_cr_lf_tab_chars(s):
     """
-    Return a string removing verbatim, escaped line endings (such as \n).
+    Return a string replacing with a space any verbatim but escaped line
+    endings and tabs (such as a literal \n or \r \t).
     """
     if not s:
         return s
@@ -179,7 +180,7 @@ def unicode_text_lines(location):
     if T.contains_text:
         with open(location, 'rbU') as f:
             for line in f:
-                yield remove_verbatim_line_endings(as_unicode(line))
+                yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))


 def unicode_text(location):
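
The hunk shows only the rename and docstring; a rough sketch of the behavior the new name and docstring describe, stated as an assumption since the function body is not part of this diff:

    import re

    # assumed: literal escaped sequences \n, \r and \t appearing verbatim
    # in the text are each replaced by a single space
    _verbatim_chars = re.compile(r'\\[nrt]')

    def remove_verbatim_cr_lf_tab_chars(s):
        if not s:
            return s
        return _verbatim_chars.sub(' ', s)

    print(remove_verbatim_cr_lf_tab_chars('line one\\nline two'))
    # -> 'line one line two'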

0 commit comments