
Commit 687cc7c

Rework handling of false_positive rules #712

* now detected as regular exact matches
* removed as part of match filtering

Signed-off-by: Philippe Ombredanne <[email protected]>

1 parent a228723 · commit 687cc7c

File tree

4 files changed, +57 -134 lines changed


src/licensedcode/index.py

Lines changed: 23 additions & 38 deletions
@@ -155,8 +155,6 @@ class LicenseIndex(object):
         'small_rids',
         'negative_rids',
         'false_positive_rids',
-
-        'false_positive_rid_by_hash',
         'largest_false_positive_length',

         'optimized',
@@ -202,7 +200,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
         # (low_tids_mset, high_tids_mset)
         self.tids_msets_by_rid = []

-        # ---
         # mapping of hash -> single rid : duplicated rules are not allowed
         self.rid_by_hash = {}

@@ -218,8 +215,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):

         # length of the largest false_positive rule
         self.largest_false_positive_length = 0
-        # mapping of hash -> rid for false positive rule tokens hashes
-        self.false_positive_rid_by_hash = {}

         # if True the index has been optimized and becomes read only:
         # no new rules can be added
@@ -274,7 +269,7 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
                 self.false_positive_rids.add(rid)
                 if rul_len > self.largest_false_positive_length:
                     self.largest_false_positive_length = rul_len
-            elif rul.negative():
+            elif rul.negative:
                 # negative rules are matched early and their exactly matched
                 # tokens are removed from the token stream
                 self.negative_rids.add(rid)
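The call change from `rul.negative()` to `rul.negative` implies the Rule model now exposes the flag as an attribute or property rather than a method. A minimal sketch of what such a property could look like, assuming (as the old negative_match docstring below suggests) that a negative rule is one carrying no license at all; apart from `negative` and `false_positive`, the attribute names are illustrative, not ScanCode's actual Rule:

class Rule(object):
    # stand-in for the Rule model; `licenses` is an assumed attribute
    def __init__(self, licenses=None, false_positive=False):
        self.licenses = licenses or []
        self.false_positive = false_positive

    @property
    def negative(self):
        # a rule that matches no license and is not a false positive
        # negates whatever it matches exactly
        return not self.licenses and not self.false_positive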
@@ -335,13 +330,10 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
             rule_hash = match_hash.index_hash(rule_token_ids)
             dupe_rules_by_hash[rule_hash].append(rule)

-            if rule.false_positive:
-                # FP rules are not used for any matching
-                # there is nothing else for these rules
-                self.false_positive_rid_by_hash[rule_hash] = rid
-            else:
-                # negative, small and regular
+            if rule.negative:
+                negative_automaton_add(tids=rule_token_ids, rid=rid)

+            else:
                 # update hashes index
                 self.rid_by_hash[rule_hash] = rid

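`match_hash.index_hash` keys both the duplicate check and `rid_by_hash` here. Its behavior can be read off the inlined copy removed from match.py further down (whose comment says it "MUST be the same code as in match_hash"); distilled:

from array import array
from hashlib import md5

def index_hash(token_ids):
    # pack the token ids as a short-int array and digest the raw bytes
    # (array.tostring() is Python 2; use tobytes() on Python 3)
    return md5(array('h', token_ids).tostring()).digest()

Since false positive rules are not negative, they now fall into the `else` branch and are indexed through the same `rid_by_hash` as regular rules: a query whose tokens hash to a false positive rule becomes a plain exact match to it, which is what lets the filtering below rely on `rule.false_positive` alone.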
@@ -362,19 +354,14 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
                 self.tids_sets_by_rid[rid] = rlow_set, rhigh_set
                 self.tids_msets_by_rid[rid] = rlow_mset, rhigh_mset

-                # populate automatons...
-                if rule.negative():
-                    # ... with only the whole rule tokens sequence
-                    negative_automaton_add(tids=rule_token_ids, rid=rid)
-                else:
-                    # ... or with the whole rule tokens sequence
-                    rules_automaton_add(tids=rule_token_ids, rid=rid)
-                    # ... and ngrams: compute ngrams and populate the automaton with ngrams
-                    if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
-                        all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
-                        selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
-                        for pos, ngram in selected_ngrams:
-                            rules_automaton_add(tids=ngram, rid=rid, start=pos)
+                # populate automaton with the whole rule tokens sequence
+                rules_automaton_add(tids=rule_token_ids, rid=rid)
+                # ... and ngrams: compute ngrams and populate the automaton with ngrams
+                if USE_AHO_FRAGMENTS and rule.minimum_coverage < 100 and len(rule_token_ids) > NGRAM_LEN:
+                    all_ngrams = tokenize.ngrams(rule_token_ids, ngram_length=NGRAM_LEN)
+                    selected_ngrams = tokenize.select_ngrams(all_ngrams, with_pos=True)
+                    for pos, ngram in selected_ngrams:
+                        rules_automaton_add(tids=ngram, rid=rid, start=pos)

                 # update rule thresholds
                 rule.low_unique = match_set.tids_set_counter(rlow_set)
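`tokenize.ngrams` is a sliding window over the rule token ids, and `select_ngrams` then keeps a deterministic subset of those windows (with their start positions when `with_pos=True`) so that only some fragments feed the automaton. A plain sketch of the window itself, leaving the selection heuristic aside:

def ngrams(iterable, ngram_length):
    # yield each run of ngram_length consecutive token ids
    tokens = list(iterable)
    for i in range(len(tokens) - ngram_length + 1):
        yield tuple(tokens[i:i + ngram_length])

# e.g. with a window of 4:
assert list(ngrams([10, 20, 30, 40, 50], 4)) == [
    (10, 20, 30, 40), (20, 30, 40, 50)]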
@@ -390,7 +377,6 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):

         # sparser dicts for faster lookup
         sparsify(self.rid_by_hash)
-        sparsify(self.false_positive_rid_by_hash)

         dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
         if dupe_rules:
@@ -457,22 +443,22 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
             return hash_matches

         # negative rules exact matching
-        negative = []
+        negative_matches = []
         # note: detect_negative is false only to test negative rules detection proper
         if detect_negative and self.negative_rids:
             if TRACE: logger_debug('#match: NEGATIVE')
-            negative = self.negative_match(whole_query_run)
-            for neg in negative:
-                if TRACE_NEGATIVE: self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
+            negative_matches = self.negative_match(whole_query_run)
+            for neg in negative_matches:
+                if TRACE_NEGATIVE: self.debug_matches(negative_matches, ' ##match: NEGATIVE subtracting #:', location, query_string)
                 whole_query_run.subtract(neg.qspan)
-            if TRACE_NEGATIVE: logger_debug(' #match: NEGATIVE found', negative)
+            if TRACE_NEGATIVE: logger_debug(' #match: NEGATIVE found', negative_matches)

         # exact matches
         if TRACE_EXACT: logger_debug('#match: EXACT')
         exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

-        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry)
+        exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False)

         if TRACE_EXACT: self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
         if TRACE_EXACT: self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')
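Each negative match's `qspan` (its span of matched query positions) is subtracted from the whole query run so that regular matching never sees those tokens. A stand-in sketch of that step; this QueryRun is hypothetical, the real one lives in licensedcode.query:

class QueryRun(object):
    def __init__(self, positions):
        # positions of query tokens still available for matching
        self.live_positions = set(positions)

    def subtract(self, qspan):
        # drop the positions covered by a (negative) match
        self.live_positions -= set(qspan)

run = QueryRun(range(10))
run.subtract({3, 4, 5})  # a negative rule matched positions 3..5
assert sorted(run.live_positions) == [0, 1, 2, 6, 7, 8, 9]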
@@ -527,7 +513,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
                 start_offset = 0
                 while True:
                     rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
-                    if TRACE_QUERY_RUN and rule_matches:
+                    if TRACE_QUERY_RUN and rule_matches:
                         self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                     if not rule_matches:
                         break
@@ -556,7 +542,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
            logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

-        matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2)
+        matches, whole_discarded = match.refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
         if TRACE_MATCHES_DISCARD:
             discarded.extend(whole_discarded)
         matches.sort()
@@ -570,8 +556,9 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=True):

     def negative_match(self, query_run):
         """
-        Match a query run exactly against negative, license-less rules.
-        Return a list of negative LicenseMatch for a query run, subtract these matches from the query run.
+        Match a query run exactly against negative rules. Return a list
+        of negative LicenseMatch for a query run, subtract these matches
+        from the query run.
         """
         matches = match_aho.exact_match(self, query_run, self.negative_automaton)

@@ -604,8 +591,6 @@ def _print_index_stats(self):
             'negative_rids',
             'small_rids',
             'false_positive_rids',
-
-            'false_positive_rid_by_hash',
         ]

         plen = max(map(len, fields)) + 1

src/licensedcode/match.py

Lines changed: 21 additions & 40 deletions
@@ -23,21 +23,19 @@
 # Visit https://github.com/nexB/scancode-toolkit/ for support and download.

 from __future__ import absolute_import
-from __future__ import print_function
 from __future__ import division
+from __future__ import print_function

-from array import array
-from functools import partial
-from functools import total_ordering
-from hashlib import md5
 from itertools import chain
 from itertools import groupby
+from functools import partial
+from functools import total_ordering
 import textwrap

-from licensedcode import query
-from licensedcode.spans import Span
 from licensedcode import MAX_DIST
+from licensedcode import query
 from licensedcode import tokenize
+from licensedcode.spans import Span

 """
 LicenseMatch data structure and matches merging and filtering routines.
@@ -381,23 +379,6 @@ def small(self):

         return False

-    def false_positive(self, idx):
-        """
-        Return a True-ish (e.g. a false positive rule id) if the LicenseMatch match
-        is a false positive or None otherwise (nb: not False). This is done by a
-        lookup of the matched tokens sequence against the `idx` index false positive
-        rules.
-        """
-        ilen = self.ilen()
-        if ilen > idx.largest_false_positive_length:
-            return
-        rule_tokens = idx.tids_by_rid[self.rule.rid]
-        ispan = self.ispan
-        matched_itokens = array('h', (tid for ipos, tid in enumerate(rule_tokens) if ipos in ispan))
-        # note: hash computation is inlined here but MUST be the same code as in match_hash
-        matched_hash = md5(matched_itokens.tostring()).digest()
-        return idx.false_positive_rid_by_hash.get(matched_hash)
-
     def matched_text(self, whole_lines=False,
                      highlight_matched=u'%s', highlight_not_matched=u'[%s]'):
         """
@@ -904,25 +885,24 @@ def filter_spurious_matches(matches):
     return kept, discarded


-def filter_false_positive_matches(matches, idx):
+def filter_false_positive_matches(matches):
     """
-    Return a list of matches that are not false positives and a list of false
-    positive matches given an index `idx`.
+    Return a list of matches that are not false positives and a list of
+    false positive matches.
     """
     kept = []
     discarded = []
     for match in matches:
-        fp = match.false_positive(idx)
-        if fp is None:
+        if match.rule.false_positive:
+            if TRACE_REFINE: logger_debug(' ==> DISCARDING FALSE POSITIVE:', match)
+            discarded.append(match)
+        else:
             # if TRACE_REFINE: logger_debug(' ==> NOT DISCARDING FALSE POSITIVE:', match)
             kept.append(match)
-        else:
-            if TRACE_REFINE: logger_debug(' ==> DISCARDING FALSE POSITIVE:', match, 'fp rule:', idx.rules_by_rid[fp].identifier)
-            discarded.append(match)
     return kept, discarded


-def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST):
+def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST, filter_false_positive=True):
     """
     Return two sequences of matches: one contains refined good matches, and the
     other contains matches that were filtered out.
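The rewritten filter needs only the flag carried by the matched rule, with no index lookup or re-hashing. A self-contained sketch of the behavior with stand-in objects (the real LicenseMatch and Rule classes are richer, and the identifiers here are made up):

class FakeRule(object):
    def __init__(self, identifier, false_positive=False):
        self.identifier = identifier
        self.false_positive = false_positive

class FakeMatch(object):
    def __init__(self, rule):
        self.rule = rule

matches = [
    FakeMatch(FakeRule('gpl-2.0_2.RULE')),
    FakeMatch(FakeRule('false-positive_1.RULE', false_positive=True)),
]
kept = [m for m in matches if not m.rule.false_positive]
discarded = [m for m in matches if m.rule.false_positive]
assert [m.rule.identifier for m in kept] == ['gpl-2.0_2.RULE']
assert [m.rule.identifier for m in discarded] == ['false-positive_1.RULE']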
@@ -957,13 +937,6 @@ def refine_matches(matches, idx, query=None, min_score=0, max_dist=MAX_DIST):
     if TRACE: logger_debug(' #####refine_matches: SHORT discarded#', len(discarded))
     if TRACE_REFINE: map(logger_debug, discarded)

-    matches, discarded = filter_false_positive_matches(matches, idx)
-    all_discarded.extend(discarded)
-    if TRACE: logger_debug(' #####refine_matches: NOT FALSE POS #', len(matches))
-    if TRACE_REFINE: map(logger_debug, matches)
-    if TRACE: logger_debug(' #####refine_matches: FALSE POS discarded#', len(discarded))
-    if TRACE_REFINE: map(logger_debug, discarded)
-
     matches, discarded = filter_spurious_matches(matches)
     all_discarded.extend(discarded)
     if TRACE: logger_debug(' #####refine_matches: NOT SPURIOUS#', len(matches))
@@ -982,6 +955,14 @@
     if TRACE: logger_debug(' #####refine_matches: FILTERED discarded#', len(discarded))
     if TRACE_REFINE: map(logger_debug, discarded)

+    if filter_false_positive:
+        matches, discarded = filter_false_positive_matches(matches)
+        all_discarded.extend(discarded)
+        if TRACE: logger_debug(' #####refine_matches: NOT FALSE POS #', len(matches))
+        if TRACE_REFINE: map(logger_debug, matches)
+        if TRACE: logger_debug(' #####refine_matches: FALSE POS discarded#', len(discarded))
+        if TRACE_REFINE: map(logger_debug, discarded)
+
     if min_score:
         matches, discarded = filter_low_score(matches, min_score=min_score)
         all_discarded.extend(discarded)
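Read together with the index.py hunks above, the new flag is used asymmetrically: the early refinement of exact matches keeps false positives so they are first detected as ordinary exact matches, and only the final whole-query refinement strips them. The two call sites, excerpted from this commit's index.py changes:

exact_matches, exact_discarded = match.refine_matches(
    exact_matches, self, query=qry, filter_false_positive=False)
# ... later, over all matches from all query runs:
matches, whole_discarded = match.refine_matches(
    matches, idx=self, query=qry, min_score=min_score,
    max_dist=MAX_DIST // 2, filter_false_positive=True)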
