Commit b9807cc

Improve license match filtering and tracing #695

* refine logging and tracing
* match_seq.py: pre-bind some function names in loops (see the sketch below)
* match.py: refine match tracing
* minor performance and clarity improvements in match merging and filtering
* ensure contained match filtering stops early
* make merge optional in refine_matches
* add TODO and FIXME comments

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent e7989fa commit b9807cc
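
The match_seq.py change itself is not shown on this page; the pre-binding bullet above refers to a common CPython micro-optimization where attribute and method lookups are hoisted out of a hot loop into local names. A minimal, generic sketch of the idiom (the function and variable names are illustrative, not the actual match_seq.py code):

    def count_common_tokens(rule_tokens, query_token_set):
        # Pre-bind the membership test and the list append outside the loop
        # so each iteration avoids a fresh attribute lookup.
        common = []
        common_append = common.append
        in_query = query_token_set.__contains__
        for pos, token in enumerate(rule_tokens):
            if in_query(token):
                common_append((pos, token))
        return common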

4 files changed: +12 / -5 lines changed


src/licensedcode/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -51,6 +51,8 @@
 MIN_MATCH_LENGTH = 4
 MIN_MATCH_HIGH_LENGTH = 3

+# FIXME: we should consider the length of two rules and two matches when considering MAX_DIST
+# eventually this should be skipped early right during the matching too
 # maximum distance between two matches to merge
 MAX_DIST = 120

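For context, MAX_DIST caps how far apart two matches of the same rule may be and still be merged. The FIXME above suggests weighing the rule and match lengths rather than using a single fixed constant; a hedged sketch of that idea (the mergeable helper and the qstart/qend/len attribute names are assumptions for illustration, not code from this commit):

    from licensedcode import MAX_DIST

    def mergeable(match1, match2, max_dist=MAX_DIST):
        # match1 and match2 are assumed to be ordered matches of the same rule
        # with qstart/qend query positions and a len() of matched tokens.
        gap = match2.qstart - match1.qend
        # A length-aware cap per the FIXME: never allow a gap larger than the
        # smaller of the fixed constant and the combined matched lengths.
        allowed = min(max_dist, match1.len() + match2.len())
        return gap <= allowed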

src/licensedcode/index.py
Lines changed: 1 addition & 4 deletions

@@ -458,7 +458,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
         exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

-        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False)
+        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)

         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
         if TRACE_EXACT: self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

@@ -504,18 +504,15 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
             run_matches = []
             candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

-            if TRACE: logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))
             if TRACE_CANDIDATES: logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

             for candidate_num, candidate in enumerate(candidates):
-                if TRACE: logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', candidate[0], candidate[1])
                 if TRACE_QUERY_RUN:
                     _, canrule, _ = candidate
                     logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                 start_offset = 0
                 while True:
                     rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
-                    if TRACE and rule_matches: self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate')
                     if TRACE_QUERY_RUN and rule_matches:
                         self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                     if not rule_matches:
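The only functional change here is the new merge=False argument: exact automaton matches are now filtered without being merged, with merging presumably left to a later combined refinement pass. A hedged usage sketch based only on the signature shown in this commit (idx, qry and the match lists are placeholders for an existing LicenseIndex, Query and lists of LicenseMatch objects):

    from licensedcode import match

    # Filter exact matches but skip merging, mirroring the call above.
    exact_matches, exact_discarded = match.refine_matches(
        exact_matches, idx, query=qry, filter_false_positive=False, merge=False)

    # With default arguments, refinement also merges close-by matches.
    good, discarded = match.refine_matches(all_matches, idx, query=qry, min_score=0)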

src/licensedcode/match.py
Lines changed: 1 addition & 1 deletion

@@ -902,7 +902,7 @@ def filter_false_positive_matches(matches):
     return kept, discarded


-def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True):
+def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True, merge=True):
     """
     Return two sequences of matches: one contains refined good matches, and the
     other contains matches that were filtered out.
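Only the signature change is visible in this hunk. One plausible shape for how the new merge flag could gate the merging step, sketched under assumptions (merge_matches and filter_contained_matches are assumed helpers from this module; the real body of refine_matches is not shown in this diff and also handles scoring and other filters):

    def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST,
                       filter_false_positive=True, merge=True):
        all_discarded = []

        if merge:
            # Merging close-by matches of the same rule is now optional so an
            # early pass (e.g. over exact matches) can defer it.
            matches = merge_matches(matches, max_dist=max_dist)

        matches, discarded = filter_contained_matches(matches)
        all_discarded.extend(discarded)

        if filter_false_positive:
            matches, discarded = filter_false_positive_matches(matches)
            all_discarded.extend(discarded)

        # ... scoring against min_score and other filters omitted in this sketch
        return matches, all_discarded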

src/licensedcode/match_set.py
Lines changed: 8 additions & 0 deletions

@@ -218,7 +218,15 @@ def index_token_sets(token_ids, len_junk, len_good):

 CandidateData = namedtuple('CandidateData', 'intersection distance matched_length high_inter_len low_inter_len')

+# FIXME: we should consider existing aho matches when considering candidate
+# and not rematch these at all

+# FIXME: we should consider more aggressively the thresholds and what a match filters
+# would discard when we compute candaites to eventually discard many or all candidates
+# we compute too many candidates that may waste time in seq matching for no reason
+
+# FIXME: Also we should remove any weak and or small rules from the top candidates
+# and anything that cannot be seq matched at all. (e.g. no high match)
 def compute_candidates(query_run, idx, rules_subset, top=30):
     """
     Return a ranked list of rule candidates for further matching as a tuple of:
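The last FIXME suggests dropping candidates that can never produce a sequence match. A hedged sketch of such a pre-filter using only the CandidateData fields declared above; the (rule_id, rule, data) candidate layout and the prune_candidates name are assumptions inferred from the unpacking seen in index.py, not existing code:

    from licensedcode import MIN_MATCH_LENGTH

    def prune_candidates(candidates, min_high_len=1, min_len=MIN_MATCH_LENGTH):
        kept = []
        for rid, rule, data in candidates:
            # No "high" (discriminant) token overlap: a sequence match cannot
            # succeed, so skip the candidate entirely.
            if data.high_inter_len < min_high_len:
                continue
            # Overlap too small to ever reach the minimum match length.
            if data.matched_length < min_len:
                continue
            kept.append((rid, rule, data))
        return kept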
