@@ -419,6 +419,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
     returned as-is.
     To be merged, two matches must also be in increasing query and index positions.
     """
+    from licensedcode.match_seq import MATCH_SEQ
+
     # shortcut for single matches
     if len(matches) < 2:
         return matches
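The import above is deliberately placed inside the function. A minimal sketch of the pattern, assuming `licensedcode.match_seq` itself imports from this module so that a top-level import would be circular:

```python
# Minimal sketch of the late-import pattern; the circular dependency
# between this module and licensedcode.match_seq is an assumption based
# on the placement of the import in the diff.
def merge_matches(matches, max_dist=50):  # 50 is a stand-in for MAX_DIST
    # resolved at call time, after both modules have finished loading
    from licensedcode.match_seq import MATCH_SEQ

    if len(matches) < 2:  # nothing to merge
        return matches
    ...
```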
@@ -434,6 +436,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
     merged = []
     for rid, rule_matches in matches_by_rule:
         if TRACE_MERGE: logger_debug('merge_matches: processing rule:', rid)
+        rlen = rule_matches[0].rule.length
+        max_rlen_dist = min(rlen // 5, MAX_DIST)
 
         # compare two matches in the sorted sequence: current and next
         i = 0
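The new `max_rlen_dist` caps the merge distance at one fifth of the matched rule's length, so fragments of a short rule cannot be merged across a wide gap. A worked sketch, with an assumed value for `MAX_DIST`:

```python
MAX_DIST = 50  # assumption: the real constant is defined in this module

def max_merge_dist(rule_length, max_dist=MAX_DIST):
    # never merge across more than 1/5 of the rule length, and never
    # more than the global cap
    return min(rule_length // 5, max_dist)

assert max_merge_dist(40) == 8      # short rule: tight 8-token cap
assert max_merge_dist(1000) == 50   # long rule: the global cap wins
```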
@@ -445,12 +449,22 @@ def merge_matches(matches, max_dist=MAX_DIST):
                 if TRACE_MERGE: logger_debug('---> merge_matches: current:', current_match)
                 if TRACE_MERGE: logger_debug('---> merge_matches: next:   ', next_match)
 
+                # two exact matches can never be merged as they will not be overlapping
+                if current_match.matcher != MATCH_SEQ and next_match.matcher != MATCH_SEQ:
+                    if TRACE_MERGE: logger_debug('    ---> ###merge_matches: both matches are EXACT_MATCHES, skipping')
+                    break
+
+                # FIXME: also consider the match length!
                 # stop if we exceed max dist
-                if (current_match.qdistance_to(next_match) > MAX_DIST
-                    or current_match.idistance_to(next_match) > MAX_DIST):
+                # or distance over 1/5 of rule length
+                if (current_match.qdistance_to(next_match) > max_rlen_dist
+                    or current_match.idistance_to(next_match) > max_rlen_dist):
+                    if TRACE_MERGE: logger_debug('    ---> ###merge_matches: MAX_DIST reached, breaking')
                     break
 
+
                 # keep one of equal matches
+                # with same qspan: FIXME: is this ever possible?
                 if current_match.qspan == next_match.qspan and current_match.ispan == next_match.ispan:
                     if TRACE_MERGE: logger_debug('    ---> ###merge_matches: next EQUALS current, del next')
                     del rule_matches[j]
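The two new early exits can be read together as a merge-candidate predicate: at least one side must be an approximate sequence match, and the two matches must sit close enough together. A condensed sketch (in the real loop these conditions `break` out of the inner scan rather than just skip one pair):

```python
# Condensed sketch; .matcher, .qdistance_to() and .idistance_to() are
# used as in the diff, and the MATCH_SEQ tag value is an assumption.
def is_merge_candidate(current, nxt, max_rlen_dist, MATCH_SEQ='3-seq'):
    if current.matcher != MATCH_SEQ and nxt.matcher != MATCH_SEQ:
        # two exact matches never overlap: nothing to merge
        return False
    if (current.qdistance_to(nxt) > max_rlen_dist
        or current.idistance_to(nxt) > max_rlen_dist):
        # too far apart in the query or in the indexed rule
        return False
    return True
```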
@@ -507,6 +521,8 @@ def merge_matches(matches, max_dist=MAX_DIST):
                         i -= 1
                         break
 
+                # FIXME: what about the distance??
+
                 # next_match is strictly in increasing sequence: merge in current
                 if next_match.is_after(current_match):
                     current_match.update(next_match)
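When `next_match` lies strictly after `current_match` in both query and index positions, the two are merged via `update()`. A rough sketch of that merge, assuming `qspan`/`ispan` are set-like Span objects supporting union (the real `LicenseMatch.update()` may also refresh derived attributes such as scores and line ranges):

```python
# Rough sketch only, under the assumptions stated above.
def merge_into(current, nxt):
    current.qspan = current.qspan | nxt.qspan  # union of query positions
    current.ispan = current.ispan | nxt.ispan  # union of rule positions
```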
@@ -536,6 +552,9 @@ def merge_matches(matches, max_dist=MAX_DIST):
     return merged
 
 
+# FIXME: we should consider the length and distance between matches to break
+# early from the loops: trying to check containment on wildly separated matches does not make sense
+
 def filter_contained_matches(matches):
     """
     Return a filtered list of LicenseMatch given a `matches` list of LicenseMatch by
@@ -578,9 +597,11 @@ def filter_contained_matches(matches):
             current_match = matches[i]
             next_match = matches[j]
 
+            # TODO: is this really correct?
             # stop when no overlap: Touching and overlapping matches have a zero distance.
-            # if current_match.qdistance_to(next_match):
-            #     break
+            if current_match.qdistance_to(next_match):
+                if TRACE_FILTER_CONTAINS: logger_debug('    ---> ###filter_contained_matches: matches have a distance: NO OVERLAP POSSIBLE\n')
+                break
 
             if TRACE_FILTER_CONTAINS: logger_debug('---> filter_contained_matches: current: i=', i, current_match)
             if TRACE_FILTER_CONTAINS: logger_debug('---> filter_contained_matches: next: j=', j, next_match)
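The previously commented-out distance check is now live: because `matches` is sorted by query position, the first `next_match` that neither touches nor overlaps `current_match` means no later match can be contained in it, so the inner scan stops (the new TODO flags that this reasoning still deserves verification). A toy model of the assumed `qdistance_to()` semantics:

```python
# Toy model of the assumed semantics: zero for touching or overlapping
# matches, otherwise the size of the gap between them.
def qdistance(a_end, b_start):
    return max(0, b_start - a_end - 1)

assert qdistance(20, 21) == 0  # touching: containment still possible
assert qdistance(20, 18) == 0  # overlapping
assert qdistance(20, 25) == 4  # real gap: stop scanning
```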
@@ -760,9 +781,15 @@ def filter_rule_min_coverage(matches):
     Return a list of matches scoring at or above a rule-defined minimum coverage and
     a list of matches with a coverage below a rule-defined minimum coverage.
     """
+    from licensedcode.match_seq import MATCH_SEQ
+
     kept = []
     discarded = []
     for match in matches:
+        # always keep exact matches
+        if match.matcher != MATCH_SEQ:
+            kept.append(match)
+            continue
         if match.coverage() < match.rule.minimum_coverage:
             if TRACE_REFINE_RULE_MIN_COVERAGE: logger_debug('    ==> DISCARDING rule.minimum_coverage:', type(match.rule.minimum_coverage), ':', repr(match.rule.minimum_coverage), 'match:', match)
             discarded.append(match)
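Coverage measures how much of a rule actually matched; an exact match covers its whole rule by construction, which is why exact matches are now exempt from this filter. A sketch of the assumed coverage computation:

```python
# Assumed shape of coverage(): the percentage of rule tokens matched.
def coverage(matched_len, rule_len):
    return 100.0 * matched_len / rule_len

# e.g. a rule with minimum_coverage=80 discards a sequence match that
# covers only 30 of its 50 tokens:
assert coverage(30, 50) == 60.0  # 60.0 < 80: discarded
```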
@@ -798,6 +825,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5):
     on both sides by at least `unknown_count` tokens that are either unknown tokens, short
     tokens composed of a single character or tokens composed only of digits.
     """
+    from licensedcode.match_seq import MATCH_SEQ
     kept = []
     discarded = []
     if not query:
@@ -809,6 +837,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5):
         if not match.qlen() == 1:
             kept.append(match)
             continue
+        # always keep exact matches
+        if match.matcher != MATCH_SEQ:
+            kept.append(match)
+            continue
 
         qstart = match.qstart
         qend = match.qend
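With this change the single-token filter can only ever discard approximate sequence matches. Per the docstring, the spuriousness test looks at the query neighborhood on both sides of the lone matched token; a hypothetical helper to illustrate the rule (names are illustrative, not from the codebase):

```python
# Hypothetical helper, not the actual implementation: a one-token match
# is spurious when flanked on both sides by at least `unknown_count`
# unknown, single-character or digit-only tokens.
def is_spurious_single_token(junk_before, junk_after, unknown_count=5):
    return junk_before >= unknown_count and junk_after >= unknown_count

assert is_spurious_single_token(6, 7) is True   # junk on both sides
assert is_spurious_single_token(6, 2) is False  # one clean side keeps it
```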
@@ -845,9 +877,15 @@ def filter_short_matches(matches):
     """
     Return a list of matches that are not short and a list of short spurious matches.
     """
+    from licensedcode.match_seq import MATCH_SEQ
    kept = []
    discarded = []
    for match in matches:
+        # always keep exact matches
+        if match.matcher != MATCH_SEQ:
+            kept.append(match)
+            continue
+
        if match.small():
            if TRACE_REFINE_SMALL: logger_debug('    ==> DISCARDING SHORT:', match)
            discarded.append(match)
@@ -864,10 +902,16 @@ def filter_spurious_matches(matches):
     Spurious matches are small matches with a low density (e.g. where the matched
     tokens are separated by many unmatched tokens).
     """
+    from licensedcode.match_seq import MATCH_SEQ
     kept = []
     discarded = []
 
     for match in matches:
+        # always keep exact matches
+        if match.matcher != MATCH_SEQ:
+            kept.append(match)
+            continue
+
         qdens = match.qspan.density()
         idens = match.ispan.density()
         ilen = match.ilen()
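Density here measures how tightly packed the matched positions are: a low density means the matched tokens are spread across many unmatched gaps. One plausible definition, for illustration (the real `Span.density()` may differ):

```python
# Illustrative definition: matched positions divided by the full range
# they stretch over.
def density(positions):
    extent = max(positions) - min(positions) + 1
    return len(positions) / extent

assert density([1, 2, 3, 4]) == 1.0            # contiguous: dense
assert round(density([1, 10, 20]), 2) == 0.15  # scattered: likely spurious
```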
@@ -911,11 +955,15 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, fil
     if TRACE: logger_debug(' #####refine_matches: STARTING matches#', len(matches))
     if TRACE_REFINE: map(logger_debug, matches)
 
-    matches = merge_matches(matches, max_dist=max_dist)
-    if TRACE: logger_debug(' ##### refine_matches: STARTING MERGED_matches#:', len(matches))
+    if merge:
+        matches = merge_matches(matches, max_dist=max_dist)
+        if TRACE: logger_debug(' ##### refine_matches: STARTING MERGED_matches#:', len(matches))
 
     all_discarded = []
 
+    # FIXME: we should have only a single loop over all the matches at once,
+    # not tens of separate loops!
+
     matches, discarded = filter_rule_min_coverage(matches)
     all_discarded.extend(discarded)
     if TRACE: logger_debug(' #####refine_matches: NOT UNDER MIN COVERAGE #', len(matches))
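Both merge passes are now gated on a `merge` flag (presumably added to the `refine_matches` signature elsewhere in this commit; the truncated hunk header above still shows the old parameter list). The FIXME asks for the successive filter passes to be collapsed into a single loop; a sketch of what that refactoring could look like, with predicate functions standing in for the individual filters (illustrative only, not the codebase's API):

```python
# Illustrative single-pass filter: each predicate returns True when a
# match must be discarded. The per-filter trace logging and the
# exact-match exemptions would fold into the predicates themselves.
def filter_all(matches, predicates):
    kept, discarded = [], []
    for match in matches:
        if any(pred(match) for pred in predicates):
            discarded.append(match)
        else:
            kept.append(match)
    return kept, discarded
```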
@@ -971,7 +1019,8 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, fil
     if TRACE: logger_debug('   ###refine_matches: LOW SCORE discarded #:', len(discarded))
     if TRACE_REFINE: map(logger_debug, discarded)
 
-    matches = merge_matches(matches, max_dist=max_dist)
+    if merge:
+        matches = merge_matches(matches, max_dist=max_dist)
 
     logger_debug(' ##### refine_matches: FINAL MERGED_matches#:', len(matches))
     if TRACE_REFINE: map(logger_debug, matches)