Skip to content

Commit b4e112f

Browse files
committed
Merge changes from #695
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 09a3581 commit b4e112f

File tree

3 files changed

+61
-8
lines changed

3 files changed

+61
-8
lines changed

src/licensedcode/index.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,10 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
499499
matches.extend(hash_matches)
500500
continue
501501

502+
# FIXME: why do we not run an aho match again here? This would avoid
503+
# going into the costly set and seq re-match that may not be needed at all
504+
# alternatively we should consider aho matches to exclude them from candidates
505+
502506
# query run match proper using sequence matching
503507
#########################################
504508
if TRACE: logger_debug(' #match: Query run MATCHING proper....')

src/licensedcode/match.py

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
419419
returned as-is.
420420
For being merged two matches must also be in increasing query and index positions.
421421
"""
422+
from licensedcode.match_seq import MATCH_SEQ
423+
422424
# shortcut for single matches
423425
if len(matches) < 2:
424426
return matches
@@ -434,6 +436,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
434436
merged = []
435437
for rid, rule_matches in matches_by_rule:
436438
if TRACE_MERGE: logger_debug('merge_matches: processing rule:', rid)
439+
rlen = rule_matches[0].rule.length
440+
max_rlen_dist = min(rlen // 5, MAX_DIST)
437441

438442
# compare two matches in the sorted sequence: current and next
439443
i = 0
@@ -445,12 +449,22 @@ def merge_matches(matches, max_dist=MAX_DIST):
445449
if TRACE_MERGE: logger_debug('---> merge_matches: current:', current_match)
446450
if TRACE_MERGE: logger_debug('---> merge_matches: next: ', next_match)
447451

452+
# two exact matches can never be merged as they will not be overlapping
453+
if current_match.matcher != MATCH_SEQ and next_match.matcher != MATCH_SEQ:
454+
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: both matches are EXACT_MATCHES, skipping')
455+
break
456+
457+
# FIXME: also consider the match length!
448458
# stop if we exceed max dist
449-
if (current_match.qdistance_to(next_match) > MAX_DIST
450-
or current_match.idistance_to(next_match) > MAX_DIST):
459+
# or distance over 1/5 of rule length
460+
if (current_match.qdistance_to(next_match) > max_rlen_dist
461+
or current_match.idistance_to(next_match) > max_rlen_dist):
462+
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: MAX_DIST reached, breaking')
451463
break
452464

465+
453466
# keep one of equal matches
467+
# with same qspan: FIXME: is this ever possible?
454468
if current_match.qspan == next_match.qspan and current_match.ispan == next_match.ispan:
455469
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: next EQUALS current, del next')
456470
del rule_matches[j]
@@ -507,6 +521,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
507521
i -= 1
508522
break
509523

524+
# FIXME: what about the distance??
525+
510526
# next_match is strictly in increasing sequence: merge in current
511527
if next_match.is_after(current_match):
512528
current_match.update(next_match)
@@ -536,6 +552,9 @@ def merge_matches(matches, max_dist=MAX_DIST):
536552
return merged
537553

538554

555+
# FIXME we should consider the length and distance between matches to break
556+
# early from the loops: trying to check containment on wildly separated matches does not make sense
557+
539558
def filter_contained_matches(matches):
540559
"""
541560
Return a filtered list of LicenseMatch given a `matches` list of LicenseMatch by
@@ -578,9 +597,11 @@ def filter_contained_matches(matches):
578597
current_match = matches[i]
579598
next_match = matches[j]
580599

600+
# TODO: is this really correct?
581601
# stop when no overlap: Touching and overlapping matches have a zero distance.
582-
# if current_match.qdistance_to(next_match):
583-
# break
602+
if current_match.qdistance_to(next_match):
603+
if TRACE_FILTER_CONTAINS: logger_debug(' ---> ###filter_contained_matches: matches have a distance: NO OVERLAP POSSIBLE\n')
604+
break
584605

585606
if TRACE_FILTER_CONTAINS: logger_debug('---> filter_contained_matches: current: i=', i, current_match)
586607
if TRACE_FILTER_CONTAINS: logger_debug('---> filter_contained_matches: next: j=', j, next_match)
@@ -760,9 +781,15 @@ def filter_rule_min_coverage(matches):
760781
Return a list of matches scoring at or above a rule-defined minimum coverage and
761782
a list of matches with a coverage below a rule-defined minimum coverage.
762783
"""
784+
from licensedcode.match_seq import MATCH_SEQ
785+
763786
kept = []
764787
discarded = []
765788
for match in matches:
789+
# always keep exact matches
790+
if match.matcher != MATCH_SEQ:
791+
kept.append(match)
792+
continue
766793
if match.coverage() < match.rule.minimum_coverage:
767794
if TRACE_REFINE_RULE_MIN_COVERAGE: logger_debug(' ==> DISCARDING rule.minimum_coverage:', type(match.rule.minimum_coverage), ':', repr(match.rule.minimum_coverage), 'match:', match)
768795
discarded.append(match)
@@ -798,6 +825,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5):
798825
on both sides by at least `unknown_count` tokens of either unknown tokens, short
799826
tokens composed of a single character or tokens composed only of digits.
800827
"""
828+
from licensedcode.match_seq import MATCH_SEQ
801829
kept = []
802830
discarded = []
803831
if not query:
@@ -809,6 +837,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5):
809837
if not match.qlen() == 1:
810838
kept.append(match)
811839
continue
840+
# always keep exact matches
841+
if match.matcher != MATCH_SEQ:
842+
kept.append(match)
843+
continue
812844

813845
qstart = match.qstart
814846
qend = match.qend
@@ -845,9 +877,15 @@ def filter_short_matches(matches):
845877
"""
846878
Return a list of matches that are not short and a list of short spurious matches.
847879
"""
880+
from licensedcode.match_seq import MATCH_SEQ
848881
kept = []
849882
discarded = []
850883
for match in matches:
884+
# always keep exact matches
885+
if match.matcher != MATCH_SEQ:
886+
kept.append(match)
887+
continue
888+
851889
if match.small():
852890
if TRACE_REFINE_SMALL: logger_debug(' ==> DISCARDING SHORT:', match)
853891
discarded.append(match)
@@ -864,10 +902,16 @@ def filter_spurious_matches(matches):
864902
Spurious matches are small matches with a low density (e.g. where the matched
865903
tokens are separated by many unmatched tokens.)
866904
"""
905+
from licensedcode.match_seq import MATCH_SEQ
867906
kept = []
868907
discarded = []
869908

870909
for match in matches:
910+
# always keep exact matches
911+
if match.matcher != MATCH_SEQ:
912+
kept.append(match)
913+
continue
914+
871915
qdens = match.qspan.density()
872916
idens = match.ispan.density()
873917
ilen = match.ilen()
@@ -911,11 +955,15 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, fil
911955
if TRACE: logger_debug(' #####refine_matches: STARTING matches#', len(matches))
912956
if TRACE_REFINE: map(logger_debug, matches)
913957

914-
matches = merge_matches(matches, max_dist=max_dist)
915-
if TRACE: logger_debug(' ##### refine_matches: STARTING MERGED_matches#:', len(matches))
958+
if merge:
959+
matches = merge_matches(matches, max_dist=max_dist)
960+
if TRACE: logger_debug(' ##### refine_matches: STARTING MERGED_matches#:', len(matches))
916961

917962
all_discarded = []
918963

964+
# FIXME: we should have only a single loop on all the matches at once!!
965+
# and not 10's of loops!!!
966+
919967
matches, discarded = filter_rule_min_coverage(matches)
920968
all_discarded.extend(discarded)
921969
if TRACE: logger_debug(' #####refine_matches: NOT UNDER MIN COVERAGE #', len(matches))
@@ -971,7 +1019,8 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, fil
9711019
if TRACE: logger_debug(' ###refine_matches: LOW SCORE discarded #:', len(discarded))
9721020
if TRACE_REFINE: map(logger_debug, discarded)
9731021

974-
matches = merge_matches(matches, max_dist=max_dist)
1022+
if merge:
1023+
matches = merge_matches(matches, max_dist=max_dist)
9751024

9761025
logger_debug(' ##### refine_matches: FINAL MERGED_matches#:', len(matches))
9771026
if TRACE_REFINE: map(logger_debug, matches)

src/licensedcode/match_set.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ def compute_candidates(query_run, idx, rules_subset, top=30):
302302
tops = [rule.identifier for _rid, rule, _inter in candidates[:10]]
303303
logger_debug(tops)
304304

305-
# discard false positive rules from candidates: we never want to to
305+
# discard false positive rules from candidates: we never want to run
306306
# a sequence match on these
307307
candidates = [(rid, rule, inter) for (rid, rule, inter) in candidates if not rule.false_positive]
308308

0 commit comments

Comments
 (0)