Skip to content

Commit 9632433

Browse files
Fix unknown license detection #3343
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 1abf7e3 commit 9632433

File tree

2 files changed

+18
-13
lines changed

2 files changed

+18
-13
lines changed

src/licensedcode/match_unknown.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
TRACE = False
2424

2525
if TRACE:
26+
use_print = True
2627
import logging
2728
import sys
2829

@@ -31,6 +32,9 @@
3132
def logger_debug(*args):
3233
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
3334

35+
if use_print:
36+
logger_debug = print
37+
3438
logging.basicConfig(stream=sys.stdout)
3539
logger.setLevel(logging.DEBUG)
3640

@@ -142,22 +146,25 @@ def match_unknowns(
142146
unknown_ngram_length=unknown_ngram_length,
143147
)
144148

149+
# build match from merged matched ngrams
150+
qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams)
151+
qspan = Span().union(*qspans)
152+
145153
if TRACE:
146154
tokens_by_tid = idx.tokens_by_tid
147155

148156
def get_tokens(_toks):
149157
return (' '.join(tokens_by_tid[t] for t in _toks))
150158

151159
print('match_unknowns: matched_ngrams')
152-
for qstart, qend, matched_toks in matched_ngrams:
160+
161+
for qstart, qend in matched_ngrams:
162+
_span = Span(qstart, qend)
163+
_tokens = [query_tokens[qpos] for qpos in _span]
153164
print(
154165
' ', 'qstart', qstart,
155166
'qend', qend,
156-
'matched_toks', get_tokens(matched_toks))
157-
158-
# build match from merged matched ngrams
159-
qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams)
160-
qspan = Span().union(*qspans)
167+
'matched_toks', get_tokens(_tokens))
161168

162169
if not qspan:
163170
return
@@ -169,7 +176,8 @@ def get_tokens(_toks):
169176
match_len = len(qspan)
170177

171178
if TRACE:
172-
print('match_unknowns: matched_span:', get_tokens(matched_tokens))
179+
#print('match_unknowns: matched_span:', get_tokens(matched_tokens))
180+
print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens)
173181

174182
# we use the query side to build the ispans
175183
ispan = Span(0, match_len)
@@ -180,9 +188,8 @@ def get_tokens(_toks):
180188
try:
181189
match_start_line = line_by_pos[qspan.start]
182190
match_end_line = line_by_pos[qspan.end]
183-
except:
184-
print('empty span:', qspan)
185-
raise
191+
except Exception as e:
192+
raise Exception('empty span:', qspan) from e
186193

187194
text = ''.join(get_full_qspan_matched_text(
188195
match_qspan=qspan,

src/licensedcode/models.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2542,14 +2542,12 @@ def __attrs_post_init__(self, *args, **kwargs):
25422542
self.identifier = f'license-detection-unknown-{self._unique_id}'
25432543

25442544
self.license_expression = UNKNOWN_LICENSE_KEY
2545-
# note that this could be shared across rules as an optimization
2545+
# TODO: this could be shared across rules as an optimization
25462546
self.license_expression_object = self.licensing.parse(UNKNOWN_LICENSE_KEY)
25472547
self.is_license_notice = True
25482548
self.notes = 'Unknown license based on a composite of license words.'
25492549
self.is_synthetic = True
25502550
self.setup()
2551-
# called only for its side effects
2552-
self.tokens()
25532551

25542552

25552553
@attr.s(slots=True, repr=False)

0 commit comments

Comments
 (0)