Skip to content

Commit 590f4cd

Browse files
committed
Use 1/2 of rule len as LicenseMatch max merge dist
* 1/5th was too small
* do not break out of merge loop if two matches are not seq matches

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 72e4bb2 commit 590f4cd

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

src/licensedcode/match.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ def merge_matches(matches, max_dist=MAX_DIST):
437437
for rid, rule_matches in matches_by_rule:
438438
if TRACE_MERGE: logger_debug('merge_matches: processing rule:', rid)
439439
rlen = rule_matches[0].rule.length
440-
max_rlen_dist = min((rlen // 5) or 1, MAX_DIST)
440+
max_rlen_dist = min((rlen // 2) or 1, MAX_DIST)
441441

442442
# compare two matches in the sorted sequence: current and next
443443
i = 0
@@ -450,13 +450,14 @@ def merge_matches(matches, max_dist=MAX_DIST):
450450
if TRACE_MERGE: logger_debug('---> merge_matches: next: ', next_match)
451451

452452
# two exact matches can never be merged as they will not be overlapping
453-
if current_match.matcher != MATCH_SEQ and next_match.matcher != MATCH_SEQ:
454-
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: both matches are EXACT_MATCHES, skipping')
455-
break
453+
# only sequence matches for the same rule can be merged
454+
#if current_match.matcher != MATCH_SEQ and next_match.matcher != MATCH_SEQ:
455+
# if TRACE_MERGE: logger_debug(' ---> ###merge_matches: both matches are EXACT_MATCHES, skipping')
456+
# break
456457

457458
# FIXME: also considers the match length!
458459
# stop if we exceed max dist
459-
# or distance over 1/5 of rule length
460+
# or distance over 1/2 of rule length
460461
if (current_match.qdistance_to(next_match) > max_rlen_dist
461462
or current_match.idistance_to(next_match) > max_rlen_dist):
462463
if TRACE_MERGE: logger_debug(' ---> ###merge_matches: MAX_DIST reached, breaking')

tests/licensedcode/test_match.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,24 @@ def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(self)
280280

281281
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps(self):
282282
r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
283+
r1.length = 50
283284

284285
m1 = LicenseMatch(rule=r1, qspan=Span(1, 3), ispan=Span(1, 3))
285286
m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(4, 10))
286287

287-
assert [LicenseMatch(rule=r1, qspan=Span(1, 3) | Span(14, 20), ispan=Span(1, 10))] == merge_matches([m1, m2])
288+
expected = [LicenseMatch(rule=r1, qspan=Span(1, 3) | Span(14, 20), ispan=Span(1, 10))]
289+
results = merge_matches([m1, m2])
290+
assert expected == results
291+
292+
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
293+
r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
294+
r1.length = 20
295+
m1 = LicenseMatch(rule=r1, qspan=Span(1, 10), ispan=Span(1, 10))
296+
m2 = LicenseMatch(rule=r1, qspan=Span(14, 20), ispan=Span(14, 20))
297+
298+
expected = [LicenseMatch(rule=r1, qspan=Span(1, 10) | Span(14, 20), ispan=Span(1, 10) | Span(14, 20))]
299+
results = merge_matches([m1, m2])
300+
assert expected == results
288301

289302
def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_not_sequence(self):
290303
r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])

0 commit comments

Comments
 (0)