Commit b9807cc

Improve license match filtering and tracing #695

* refine logging and tracing
* match_seq.py: pre-bind some function names in loops (see the sketch below)
* match.py: refine match tracing
* minor performance and clarity improvements in match merging and filtering
* ensure contained match filtering stops early
* make merge optional in refine_matches
* add TODO and FIXME comments

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent e7989fa commit b9807cc
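
The match_seq.py change itself is not shown on this page; the pre-binding bullet above refers to a common CPython micro-optimization where attribute and method lookups are hoisted out of a hot loop into local names. A minimal, generic sketch of the idiom (the function and variable names are illustrative, not the actual match_seq.py code):

    def count_common_tokens(rule_tokens, query_token_set):
        # Pre-bind the membership test and the list append outside the loop
        # so each iteration avoids a fresh attribute lookup.
        common = []
        common_append = common.append
        in_query = query_token_set.__contains__
        for pos, token in enumerate(rule_tokens):
            if in_query(token):
                common_append((pos, token))
        return common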

4 files changed: +12 / -5 lines changed


src/licensedcode/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -51,6 +51,8 @@
 MIN_MATCH_LENGTH = 4
 MIN_MATCH_HIGH_LENGTH = 3

+# FIXME: we should consider the length of two rules and two matches when considering MAX_DIST
+# eventually this should be skipped early right during the matching too
 # maximum distance between two matches to merge
 MAX_DIST = 120

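For context, MAX_DIST caps how far apart two matches of the same rule may be and still be merged. The FIXME above suggests weighing the rule and match lengths rather than using a single fixed constant; a hedged sketch of that idea (the mergeable helper and the qstart/qend/len attribute names are assumptions for illustration, not code from this commit):

    from licensedcode import MAX_DIST

    def mergeable(match1, match2, max_dist=MAX_DIST):
        # match1 and match2 are assumed to be ordered matches of the same rule
        # with qstart/qend query positions and a len() of matched tokens.
        gap = match2.qstart - match1.qend
        # A length-aware cap per the FIXME: never allow a gap larger than the
        # smaller of the fixed constant and the combined matched lengths.
        allowed = min(max_dist, match1.len() + match2.len())
        return gap <= allowed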

src/licensedcode/index.py
Lines changed: 1 addition & 4 deletions

@@ -458,7 +458,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
         exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

-        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False)
+        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)

         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
         if TRACE_EXACT: self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

@@ -504,18 +504,15 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
             run_matches = []
             candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

-            if TRACE: logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))
             if TRACE_CANDIDATES: logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

             for candidate_num, candidate in enumerate(candidates):
-                if TRACE: logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', candidate[0], candidate[1])
                 if TRACE_QUERY_RUN:
                     _, canrule, _ = candidate
                     logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                 start_offset = 0
                 while True:
                     rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
-                    if TRACE and rule_matches: self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate')
                     if TRACE_QUERY_RUN and rule_matches:
                         self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                     if not rule_matches:
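The only functional change here is the new merge=False argument: exact automaton matches are now filtered without being merged, with merging presumably left to a later combined refinement pass. A hedged usage sketch based only on the signature shown in this commit (idx, qry and the match lists are placeholders for an existing LicenseIndex, Query and lists of LicenseMatch objects):

    from licensedcode import match

    # Filter exact matches but skip merging, mirroring the call above.
    exact_matches, exact_discarded = match.refine_matches(
        exact_matches, idx, query=qry, filter_false_positive=False, merge=False)

    # With default arguments, refinement also merges close-by matches.
    good, discarded = match.refine_matches(all_matches, idx, query=qry, min_score=0)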

src/licensedcode/match.py
Lines changed: 1 addition & 1 deletion

@@ -902,7 +902,7 @@ def filter_false_positive_matches(matches):
     return kept, discarded


-def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True):
+def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True, merge=True):
     """
     Return two sequences of matches: one contains refined good matches, and the
     other contains matches that were filtered out.
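Only the signature change is visible in this hunk. One plausible shape for how the new merge flag could gate the merging step, sketched under assumptions (merge_matches and filter_contained_matches are assumed helpers from this module; the real body of refine_matches is not shown in this diff and also handles scoring and other filters):

    def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST,
                       filter_false_positive=True, merge=True):
        all_discarded = []

        if merge:
            # Merging close-by matches of the same rule is now optional so an
            # early pass (e.g. over exact matches) can defer it.
            matches = merge_matches(matches, max_dist=max_dist)

        matches, discarded = filter_contained_matches(matches)
        all_discarded.extend(discarded)

        if filter_false_positive:
            matches, discarded = filter_false_positive_matches(matches)
            all_discarded.extend(discarded)

        # ... scoring against min_score and other filters omitted in this sketch
        return matches, all_discarded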

src/licensedcode/match_set.py
Lines changed: 8 additions & 0 deletions

@@ -218,7 +218,15 @@ def index_token_sets(token_ids, len_junk, len_good):

 CandidateData = namedtuple('CandidateData', 'intersection distance matched_length high_inter_len low_inter_len')

+# FIXME: we should consider existing aho matches when considering candidate
+# and not rematch these at all

+# FIXME: we should consider more aggressively the thresholds and what a match filters
+# would discard when we compute candaites to eventually discard many or all candidates
+# we compute too many candidates that may waste time in seq matching for no reason
+
+# FIXME: Also we should remove any weak and or small rules from the top candidates
+# and anything that cannot be seq matched at all. (e.g. no high match)
 def compute_candidates(query_run, idx, rules_subset, top=30):
     """
     Return a ranked list of rule candidates for further matching as a tuple of:
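The last FIXME suggests dropping candidates that can never produce a sequence match. A hedged sketch of such a pre-filter using only the CandidateData fields declared above; the (rule_id, rule, data) candidate layout and the prune_candidates name are assumptions inferred from the unpacking seen in index.py, not existing code:

    from licensedcode import MIN_MATCH_LENGTH

    def prune_candidates(candidates, min_high_len=1, min_len=MIN_MATCH_LENGTH):
        kept = []
        for rid, rule, data in candidates:
            # No "high" (discriminant) token overlap: a sequence match cannot
            # succeed, so skip the candidate entirely.
            if data.high_inter_len < min_high_len:
                continue
            # Overlap too small to ever reach the minimum match length.
            if data.matched_length < min_len:
                continue
            kept.append((rid, rule, data))
        return kept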
