
Commit 687cc7c

Rework handling of false_positive rules #712

* now detected as regular exact matches
* removed as part of match filtering

Signed-off-by: Philippe Ombredanne <[email protected]>

1 parent a228723 · commit 687cc7c

File tree

4 files changed, +57 -134 lines changed


src/licensedcode/index.py

Lines changed: 23 additions & 38 deletions
@@ -155,8 +155,6 @@ class LicenseIndex(object):
         'small_rids',
         'negative_rids',
         'false_positive_rids',
-
-        'false_positive_rid_by_hash',
         'largest_false_positive_length',

         'optimized',
@@ -202,7 +200,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
         # (low_tids_mset, high_tids_mset)
         self.tids_msets_by_rid = []

-        # ---
         # mapping of hash -> single rid : duplicated rules are not allowed
         self.rid_by_hash = {}

@@ -218,8 +215,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):

         # length of the largest false_positive rule
         self.largest_false_positive_length = 0
-        # mapping of hash -> rid for false positive rule tokens hashes
-        self.false_positive_rid_by_hash = {}

         # if True the index has been optimized and becomes read only:
         # no new rules can be added
@@ -274,7 +269,7 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
                 self.false_positive_rids.add(rid)
                 if rul_len > self.largest_false_positive_length:
                     self.largest_false_positive_length = rul_len
-            elif rul.negative():
+            elif rul.negative:
                 # negative rules are matched early and their exactly matched
                 # tokens are removed from the token stream
                 self.negative_rids.add(rid)
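The call change from `rul.negative()` to `rul.negative` implies the Rule model now exposes the flag as an attribute or property rather than a method. A minimal sketch of what such a property could look like, assuming (as the old negative_match docstring below suggests) that a negative rule is one carrying no license at all; apart from `negative` and `false_positive`, the attribute names are illustrative, not ScanCode's actual Rule:

class Rule(object):
    # stand-in for the Rule model; `licenses` is an assumed attribute
    def __init__(self, licenses=None, false_positive=False):
        self.licenses = licenses or []
        self.false_positive = false_positive

    @property
    def negative(self):
        # a rule that matches no license and is not a false positive
        # negates whatever it matches exactly
        return not self.licenses and not self.false_positive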
@@ -335,13 +330,10 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
             rule_hash = match_hash.index_hash(rule_token_ids)
             dupe_rules_by_hash[rule_hash].append(rule)

-            if rule.false_positive:
-                # FP rules are not used for any matching
-                # there is nothing else for these rules
-                self.false_positive_rid_by_hash[rule_hash] = rid
-            else:
-                # negative, small and regular
+            if rule.negative:
+                negative_automaton_add(tids=rule_token_ids, rid=rid)

+            else:
                 # update hashes index
                 self.rid_by_hash[rule_hash] = rid

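`match_hash.index_hash` keys both the duplicate check and `rid_by_hash` here. Its behavior can be read off the inlined copy removed from match.py further down (whose comment says it "MUST be the same code as in match_hash"); distilled:

from array import array
from hashlib import md5

def index_hash(token_ids):
    # pack the token ids as a short-int array and digest the raw bytes
    # (array.tostring() is Python 2; use tobytes() on Python 3)
    return md5(array('h', token_ids).tostring()).digest()

Since false positive rules are not negative, they now fall into the `else` branch and are indexed through the same `rid_by_hash` as regular rules: a query whose tokens hash to a false positive rule becomes a plain exact match to it, which is what lets the filtering below rely on `rule.false_positive` alone.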
@@ -362,19 +354,14 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
                 self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                 self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

-                # populate automatons...
-                if rule.negative():
-                    # ... with only the whole rule tokens sequence
-                    negative_automaton_add(tids=rule_token_ids, rid=rid)
-                else:
-                    # ... or with the whole rule tokens sequence
-                    rules_automaton_add(tids=rule_token_ids, rid=rid)
-                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
-                    if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
-                        all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
-                        selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
-                        for pos, ngram in selected_ngrams:
-                            rules_automaton_add(tids=ngram, rid=rid, start=pos)
+                # populate automaton with the whole rule tokens sequence
+                rules_automaton_add(tids=rule_token_ids, rid=rid)
+                # ... and ngrams: compute ngrams and populate the automaton with ngrams
+                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
+                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
+                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
+                    for pos, ngram in selected_ngrams:
+                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                 # update rule thresholds
                 rule.low_unique = match_set.tids_set_counter(rlow_set)
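`tokenize.ngrams` is a sliding window over the rule token ids, and `select_ngrams` then keeps a deterministic subset of those windows (with their start positions when `with_pos=True`) so that only some fragments feed the automaton. A plain sketch of the window itself, leaving the selection heuristic aside:

def ngrams(iterable, ngram_length):
    # yield each run of ngram_length consecutive token ids
    tokens = list(iterable)
    for i in range(len(tokens) - ngram_length + 1):
        yield tuple(tokens[i:i + ngram_length])

# e.g. with a window of 4:
assert list(ngrams([10, 20, 30, 40, 50], 4)) == [
    (10, 20, 30, 40), (20, 30, 40, 50)]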
@@ -390,7 +377,6 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):

         # sparser dicts for faster lookup
         sparsify(self.rid_by_hash)
-        sparsify(self.false_positive_rid_by_hash)

         dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
         if dupe_rules:
@@ -457,22 +443,22 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
             return hash_matches

         # negative rules exact matching
-        negative = []
+        negative_matches = []
         # note: detect_negative is false only to test negative rules detection proper
         if detect_negative and self.negative_rids:
             if TRACE: logger_debug('#match: NEGATIVE')
-            negative = self.negative_match(whole_query_run)
-            for neg in negative:
-                if TRACE_NEGATIVE: self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
+            negative_matches = self.negative_match(whole_query_run)
+            for neg in negative_matches:
+                if TRACE_NEGATIVE: self.debug_matches(negative_matches, ' ##match: NEGATIVE subtracting #:', location, query_string)
                 whole_query_run.subtract(neg.qspan)
-            if TRACE_NEGATIVE: logger_debug(' #match: NEGATIVE found', negative)
+            if TRACE_NEGATIVE: logger_debug(' #match: NEGATIVE found', negative_matches)

         # exact matches
         if TRACE_EXACT: logger_debug('#match: EXACT')
         exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

-        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry)
+        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False)

         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
         if TRACE_EXACT: self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')
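Each negative match's `qspan` (its span of matched query positions) is subtracted from the whole query run so that regular matching never sees those tokens. A stand-in sketch of that step; this QueryRun is hypothetical, the real one lives in licensedcode.query:

class QueryRun(object):
    def __init__(self, positions):
        # positions of query tokens still available for matching
        self.live_positions = set(positions)

    def subtract(self, qspan):
        # drop the positions covered by a (negative) match
        self.live_positions -= set(qspan)

run = QueryRun(range(10))
run.subtract({3, 4, 5})  # a negative rule matched positions 3..5
assert sorted(run.live_positions) == [0, 1, 2, 6, 7, 8, 9]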
@@ -527,7 +513,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
                 start_offset = 0
                 while True:
                     rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
-                    if TRACE_QUERY_RUN and rule_matches:
+                    if TRACE_QUERY_RUN and rule_matches:
                         self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                     if not rule_matches:
                         break
@@ -556,7 +542,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
            logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

-        matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2)
+        matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
         if TRACE_MATCHES_DISCARD:
             discarded.extend(whole_discarded)
         matches.sort()
@@ -570,8 +556,9 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):

     def negative_match(self, query_run):
         """
-        Match a query run exactly against negative, license-less rules.
-        Return a list of negative LicenseMatch for a query run, subtract these matches from the query run.
+        Match a query run exactly against negative rules. Return a list
+        of negative LicenseMatch for a query run, subtract these matches
+        from the query run.
         """
         matches = match_aho.exact_match(self, query_run, self.negative_automaton)

@@ -604,8 +591,6 @@ def _print_index_stats(self):
             'negative_rids',
             'small_rids',
             'false_positive_rids',
-
-            'false_positive_rid_by_hash',
         ]

         plen = max(map(len, fields)) + 1

src/licensedcode/match.py

Lines changed: 21 additions & 40 deletions
@@ -23,21 +23,19 @@
 # Visit https://github.com/nexB/scancode-toolkit/ for support and download.

 from __future__ import absolute_import
-from __future__ import print_function
 from __future__ import division
+from __future__ import print_function

-from array import array
-from functools import partial
-from functools import total_ordering
-from hashlib import md5
 from itertools import chain
 from itertools import groupby
+from functools import partial
+from functools import total_ordering
 import textwrap

-from licensedcode import query
-from licensedcode.spans import Span
 from licensedcode import MAX_DIST
+from licensedcode import query
 from licensedcode import tokenize
+from licensedcode.spans import Span

 """
 LicenseMatch data structure and matches merging and filtering routines.
@@ -381,23 +379,6 @@ def small(self):

         return False

-    def false_positive(self, idx):
-        """
-        Return a True-ish (e.g. a false positive rule id) if the LicenseMatch match
-        is a false positive or None otherwise (nb: not False). This is done by a
-        lookup of the matched tokens sequence against the `idx` index false positive
-        rules.
-        """
-        ilen = self.ilen()
-        if ilen > idx.largest_false_positive_length:
-            return
-        rule_tokens = idx.tids_by_rid[self.rule.rid]
-        ispan = self.ispan
-        matched_itokens = array('h', (tid for ipos, tid in enumerate(rule_tokens) if ipos in ispan))
-        # note: hash computation is inlined here but MUST be the same code as in match_hash
-        matched_hash = md5(matched_itokens.tostring()).digest()
-        return idx.false_positive_rid_by_hash.get(matched_hash)
-
     def matched_text(self, whole_lines=False,
                      highlight_matched=u'%s', highlight_not_matched=u'[%s]'):
         """
@@ -904,25 +885,24 @@ def filter_spurious_matches(matches):
     return kept, discarded


-def filter_false_positive_matches(matches, idx):
+def filter_false_positive_matches(matches):
     """
-    Return a list of matches that are not false positives and a list of false
-    positive matches given an index `idx`.
+    Return a list of matches that are not false positives and a list of
+    false positive matches.
     """
     kept = []
     discarded = []
     for match in matches:
-        fp = match.false_positive(idx)
-        if fp is None:
+        if match.rule.false_positive:
+            if TRACE_REFINE: logger_debug(' ==> DISCARDING FALSE POSITIVE:', match)
+            discarded.append(match)
+        else:
             # if TRACE_REFINE: logger_debug(' ==> NOT DISCARDING FALSE POSITIVE:', match)
             kept.append(match)
-        else:
-            if TRACE_REFINE: logger_debug(' ==> DISCARDING FALSE POSITIVE:', match, 'fp rule:', idx.rules_by_rid[fp].identifier)
-            discarded.append(match)
     return kept, discarded


-def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST):
+def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True):
     """
     Return two sequences of matches: one contains refined good matches, and the
     other contains matches that were filtered out.
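The rewritten filter needs only the flag carried by the matched rule, with no index lookup or re-hashing. A self-contained sketch of the behavior with stand-in objects (the real LicenseMatch and Rule classes are richer, and the identifiers here are made up):

class FakeRule(object):
    def __init__(self, identifier, false_positive=False):
        self.identifier = identifier
        self.false_positive = false_positive

class FakeMatch(object):
    def __init__(self, rule):
        self.rule = rule

matches = [
    FakeMatch(FakeRule('gpl-2.0_2.RULE')),
    FakeMatch(FakeRule('false-positive_1.RULE', false_positive=True)),
]
kept = [m for m in matches if not m.rule.false_positive]
discarded = [m for m in matches if m.rule.false_positive]
assert [m.rule.identifier for m in kept] == ['gpl-2.0_2.RULE']
assert [m.rule.identifier for m in discarded] == ['false-positive_1.RULE']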
@@ -957,13 +937,6 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST):
     if TRACE: logger_debug(' #####refine_matches: SHORT discarded#', len(discarded))
     if TRACE_REFINE: map(logger_debug, discarded)

-    matches, discarded = filter_false_positive_matches(matches, idx)
-    all_discarded.extend(discarded)
-    if TRACE: logger_debug(' #####refine_matches: NOT FALSE POS #', len(matches))
-    if TRACE_REFINE: map(logger_debug, matches)
-    if TRACE: logger_debug(' #####refine_matches: FALSE POS discarded#', len(discarded))
-    if TRACE_REFINE: map(logger_debug, discarded)
-
     matches, discarded = filter_spurious_matches(matches)
     all_discarded.extend(discarded)
     if TRACE: logger_debug(' #####refine_matches: NOT SPURIOUS#', len(matches))
@@ -982,6 +955,14 @@
     if TRACE: logger_debug(' #####refine_matches: FILTERED discarded#', len(discarded))
     if TRACE_REFINE: map(logger_debug, discarded)

+    if filter_false_positive:
+        matches, discarded = filter_false_positive_matches(matches)
+        all_discarded.extend(discarded)
+        if TRACE: logger_debug(' #####refine_matches: NOT FALSE POS #', len(matches))
+        if TRACE_REFINE: map(logger_debug, matches)
+        if TRACE: logger_debug(' #####refine_matches: FALSE POS discarded#', len(discarded))
+        if TRACE_REFINE: map(logger_debug, discarded)
+
     if min_score:
         matches, discarded = filter_low_score(matches, min_score=min_score)
         all_discarded.extend(discarded)
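Read together with the index.py hunks above, the new flag is used asymmetrically: the early refinement of exact matches keeps false positives so they are first detected as ordinary exact matches, and only the final whole-query refinement strips them. The two call sites, excerpted from this commit's index.py changes:

exact_matches, exact_discarded = match.refine_matches(
    exact_matches, self, query=qry, filter_false_positive=False)
# ... later, over all matches from all query runs:
matches, whole_discarded = match.refine_matches(
    matches, idx=self, query=qry, min_score=min_score,
    max_dist=MAX_DIST // 2, filter_false_positive=True)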
