@@ -155,8 +155,6 @@ class LicenseIndex(object):
155155 'small_rids' ,
156156 'negative_rids' ,
157157 'false_positive_rids' ,
158-
159- 'false_positive_rid_by_hash' ,
160158 'largest_false_positive_length' ,
161159
162160 'optimized' ,
@@ -202,7 +200,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
202200 # (low_tids_mset, high_tids_mset)
203201 self .tids_msets_by_rid = []
204202
205- # ---
206203 # mapping of hash -> single rid : duplicated rules are not allowed
207204 self .rid_by_hash = {}
208205
@@ -218,8 +215,6 @@ def __init__(self, rules=None, _ranked_tokens=global_tokens_by_ranks):
218215
219216 # length of the largest false_positive rule
220217 self .largest_false_positive_length = 0
221- # mapping of hash -> rid for false positive rule tokens hashes
222- self .false_positive_rid_by_hash = {}
223218
224219 # if True the index has been optimized and becomes read only:
225220 # no new rules can be added
@@ -274,7 +269,7 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
274269 self .false_positive_rids .add (rid )
275270 if rul_len > self .largest_false_positive_length :
276271 self .largest_false_positive_length = rul_len
277- elif rul .negative () :
272+ elif rul .negative :
278273 # negative rules are matched early and their exactly matched
279274 # tokens are removed from the token stream
280275 self .negative_rids .add (rid )
@@ -335,13 +330,10 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
335330 rule_hash = match_hash .index_hash (rule_token_ids )
336331 dupe_rules_by_hash [rule_hash ].append (rule )
337332
338- if rule .false_positive :
339- # FP rules are not used for any matching
340- # there is nothing else for these rules
341- self .false_positive_rid_by_hash [rule_hash ] = rid
342- else :
343- # negative, small and regular
333+ if rule .negative :
334+ negative_automaton_add (tids = rule_token_ids , rid = rid )
344335
336+ else :
345337 # update hashes index
346338 self .rid_by_hash [rule_hash ] = rid
347339
@@ -362,19 +354,14 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
362354 self .tids_sets_by_rid [rid ] = rlow_set , rhigh_set
363355 self .tids_msets_by_rid [rid ] = rlow_mset , rhigh_mset
364356
365- # populate automatons...
366- if rule .negative ():
367- # ... with only the whole rule tokens sequence
368- negative_automaton_add (tids = rule_token_ids , rid = rid )
369- else :
370- # ... or with the whole rule tokens sequence
371- rules_automaton_add (tids = rule_token_ids , rid = rid )
372- # ... and ngrams: compute ngrams and populate the automaton with ngrams
373- if USE_AHO_FRAGMENTS and rule .minimum_coverage < 100 and len (rule_token_ids ) > NGRAM_LEN :
374- all_ngrams = tokenize .ngrams (rule_token_ids , ngram_length = NGRAM_LEN )
375- selected_ngrams = tokenize .select_ngrams (all_ngrams , with_pos = True )
376- for pos , ngram in selected_ngrams :
377- rules_automaton_add (tids = ngram , rid = rid , start = pos )
357+ # populate automaton with the whole rule tokens sequence
358+ rules_automaton_add (tids = rule_token_ids , rid = rid )
359+ # ... and ngrams: compute ngrams and populate the automaton with ngrams
360+ if USE_AHO_FRAGMENTS and rule .minimum_coverage < 100 and len (rule_token_ids ) > NGRAM_LEN :
361+ all_ngrams = tokenize .ngrams (rule_token_ids , ngram_length = NGRAM_LEN )
362+ selected_ngrams = tokenize .select_ngrams (all_ngrams , with_pos = True )
363+ for pos , ngram in selected_ngrams :
364+ rules_automaton_add (tids = ngram , rid = rid , start = pos )
378365
379366 # update rule thresholds
380367 rule .low_unique = match_set .tids_set_counter (rlow_set )
@@ -390,7 +377,6 @@ def _add_rules(self, rules, _ranked_tokens=global_tokens_by_ranks):
390377
391378 # sparser dicts for faster lookup
392379 sparsify (self .rid_by_hash )
393- sparsify (self .false_positive_rid_by_hash )
394380
395381 dupe_rules = [rules for rules in dupe_rules_by_hash .values () if len (rules ) > 1 ]
396382 if dupe_rules :
@@ -457,22 +443,22 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
457443 return hash_matches
458444
459445 # negative rules exact matching
460- negative = []
446+ negative_matches = []
461447 # note: detect_negative is false only to test negative rules detection proper
462448 if detect_negative and self .negative_rids :
463449 if TRACE : logger_debug ('#match: NEGATIVE' )
464- negative = self .negative_match (whole_query_run )
465- for neg in negative :
466- if TRACE_NEGATIVE : self .debug_matches (negative , ' ##match: NEGATIVE subtracting #:' , location , query_string )
450+ negative_matches = self .negative_match (whole_query_run )
451+ for neg in negative_matches :
452+ if TRACE_NEGATIVE : self .debug_matches (negative_matches , ' ##match: NEGATIVE subtracting #:' , location , query_string )
467453 whole_query_run .subtract (neg .qspan )
468- if TRACE_NEGATIVE : logger_debug (' #match: NEGATIVE found' , negative )
454+ if TRACE_NEGATIVE : logger_debug (' #match: NEGATIVE found' , negative_matches )
469455
470456 # exact matches
471457 if TRACE_EXACT : logger_debug ('#match: EXACT' )
472458 exact_matches = match_aho .exact_match (self , whole_query_run , self .rules_automaton )
473459 if TRACE_EXACT : self .debug_matches (exact_matches , ' #match: EXACT matches#:' , location , query_string )
474460
475- exact_matches , exact_discarded = match .refine_matches (exact_matches , self , query = qry )
461+ exact_matches , exact_discarded = match .refine_matches (exact_matches , self , query = qry , filter_false_positive = False )
476462
477463 if TRACE_EXACT : self .debug_matches (exact_matches , ' #match: ===> exact matches refined' )
478464 if TRACE_EXACT : self .debug_matches (exact_discarded , ' #match: ===> exact matches discarded' )
@@ -527,7 +513,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
527513 start_offset = 0
528514 while True :
529515 rule_matches = match_seq .match_sequence (self , candidate , query_run , start_offset = start_offset )
530- if TRACE_QUERY_RUN and rule_matches :
516+ if TRACE_QUERY_RUN and rule_matches :
531517 self .debug_matches (rule_matches , ' #match: query_run: seq matches for candidate' , with_text = True , query = qry )
532518 if not rule_matches :
533519 break
@@ -556,7 +542,7 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
556542 logger_debug ('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!' )
557543 self .debug_matches (matches , '#match: ALL matches from all query runs' , location , query_string )
558544
559- matches , whole_discarded = match .refine_matches (matches , idx = self , query = qry , min_score = min_score , max_dist = MAX_DIST // 2 )
545+ matches , whole_discarded = match .refine_matches (matches , idx = self , query = qry , min_score = min_score , max_dist = MAX_DIST // 2 , filter_false_positive = True )
560546 if TRACE_MATCHES_DISCARD :
561547 discarded .extend (whole_discarded )
562548 matches .sort ()
@@ -570,8 +556,9 @@ def match(self, location=None, query_string=None, min_score=0, detect_negative=T
570556
571557 def negative_match (self , query_run ):
572558 """
573- Match a query run exactly against negative, license-less rules.
574- Return a list of negative LicenseMatch for a query run, subtract these matches from the query run.
559+ Match a query run exactly against negative rules. Return a list
560+ of negative LicenseMatch for a query run, subtract these matches
561+ from the query run.
575562 """
576563 matches = match_aho .exact_match (self , query_run , self .negative_automaton )
577564
@@ -604,8 +591,6 @@ def _print_index_stats(self):
604591 'negative_rids' ,
605592 'small_rids' ,
606593 'false_positive_rids' ,
607-
608- 'false_positive_rid_by_hash' ,
609594 ]
610595
611596 plen = max (map (len , fields )) + 1
0 commit comments