@@ -163,9 +163,11 @@ def __init__(
     ):
         """
        Initialize the index with an iterable of Rule objects.
-        ``_legalese`` is a set of common license-specific words aka. legalese
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
         ``_spdx_tokens`` is a set of tokens used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
+        If ``_all_languages`` is True, use licenses and rules in all spoken languages.
+        Otherwise, use only English rules and licenses.
         """
         # total number of unique known tokens
         self.len_tokens = 0
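
The ``_legalese`` argument is now expected to already carry token ids. A minimal sketch of the expected shape, with hypothetical tokens: ids are dense integers assigned in sorted token order.

_legalese = {'copyleft': 0, 'copyright': 1, 'license': 2, 'warranty': 3}
# ids are dense and follow the sorted token order
assert list(_legalese) == sorted(_legalese)
assert list(_legalese.values()) == list(range(len(_legalese)))
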
@@ -267,9 +269,9 @@ def _add_rules(
         Add a list of Rule objects to the index and constructs optimized and
         immutable index structures.
 
-        `_legalese` is a set of common license-specific words aka. legalese
-        `_spdx_tokens` is a set of token strings used in SPDX license identifiers
-        ``license_tokens`` is a set of "license" tokens used as start or end of a rule
+        ``_legalese`` is a sorted mapping of common license-specific words aka. legalese as {token: id}
+        ``_spdx_tokens`` is a set of token strings used in SPDX license identifiers
+        ``_license_tokens`` is a set of "license" tokens used as start or end of a rule
         """
         if self.optimized:
             raise Exception('Index has been optimized and cannot be updated.')
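
For illustration only, plausible shapes for the three inputs described in this docstring; the token values are made up, not taken from the codebase:

_legalese = {'copyright': 0, 'license': 1, 'warranty': 2}   # sorted {token: id} mapping
_spdx_tokens = {'spdx', 'identifier', 'mit', 'gpl'}          # tokens seen in SPDX license identifiers
_license_tokens = {'license', 'licence', 'licensed'}         # "license" rule start/end tokens
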
@@ -281,10 +283,7 @@ def _add_rules(
         # valid "unichr" values, making it easier downstream when used in
         # automatons
 
-        self.dictionary = dictionary = {
-            ts: tid for tid, ts in enumerate(sorted(_legalese))
-        }
-
+        self.dictionary = dictionary = dict(_legalese)
         dictionary_get = dictionary.get
 
         self.len_legalese = len_legalese = len(dictionary)
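
The change above stops rebuilding token ids inside the index. A sketch contrasting the removed comprehension with the new dict(_legalese) copy, with made-up tokens: under the new form, ids stay exactly as the caller assigned them.

legalese_set = {'warranty', 'license', 'copyright'}
old_dictionary = {ts: tid for tid, ts in enumerate(sorted(legalese_set))}
# {'copyright': 0, 'license': 1, 'warranty': 2} -- ids assigned here, in sorted order

legalese_mapping = {'copyright': 0, 'license': 1, 'warranty': 2}
new_dictionary = dict(legalese_mapping)
# ids are taken as-is from the caller-provided mapping
assert old_dictionary == new_dictionary
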
@@ -385,7 +384,7 @@ def _add_rules(
 
             # A rule is weak if it does not contain at least one legalese word:
             # we consider all rules to be weak until proven otherwise below.
-            # "weak" rules can only be matched with an automaton.
+            # "weak" rules can only be matched exactly with an automaton.
             is_weak = True
 
             for rts in rule.tokens():
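
A sketch of the weak-rule test applied per token in the next hunk, assuming (as the dictionary setup above implies) that legalese tokens occupy the lowest ids, 0 through len_legalese - 1; tokens and ids here are made up.

len_legalese = 3
dictionary = {'copyright': 0, 'license': 1, 'warranty': 2, 'hello': 3, 'world': 4}

def is_weak_rule(rule_tokens):
    # weak: the rule contains no token with an id below len_legalese
    return all(dictionary[t] >= len_legalese for t in rule_tokens)

assert is_weak_rule(['hello', 'world'])        # no legalese: matched only exactly, via automaton
assert not is_weak_rule(['hello', 'license'])  # contains a legalese token: not weak
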
@@ -400,7 +399,10 @@ def _add_rules(
                 if is_weak and rtid < len_legalese:
                     is_weak = False
 
-                rule_token_ids_append(rtid)
+                try:
+                    rule_token_ids_append(rtid)
+                except Exception as e:
+                    raise Exception(rtid, rts, rule) from e
 
             rule_length = rule.length
             is_tiny = rule_length < TINY_RULE
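
A minimal sketch of why the wrapped append can fail, assuming rule_token_ids is an array.array of small integer ids (an assumption, not shown in this diff): appending None or an out-of-range id raises, and re-raising with the token id, token string, and rule makes the failing rule identifiable.

from array import array

rule = 'gpl-2.0_1.RULE'          # hypothetical rule identifier, for illustration
rule_token_ids = array('h')      # assumed storage; 'h' only holds ids up to 32767
for rtid, rts in [(5, 'license'), (None, 'someunknownword')]:
    try:
        rule_token_ids.append(rtid)
    except Exception as e:
        # surfaces which token and rule broke the indexing, as in the hunk above
        raise Exception(rtid, rts, rule) from e
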
@@ -564,21 +566,29 @@ def _add_rules(
         msg = 'Cannot support more than licensedcode.index.MAX_TOKENS: %d' % MAX_TOKENS
         assert len_tokens <= MAX_TOKENS, msg
 
-        dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
-        if dupe_rules:
-            dupe_rule_paths = []
-            for rules in dupe_rules:
-                drp = [rule.identifier for rule in rules]
-                drp.sort()
-                dupe_rule_paths.append('\n'.join(drp))
+        dupe_rule_paths = []
+        for rules in dupe_rules_by_hash.values():
+            if len(rules) == 1:
+                continue
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))
 
+        if dupe_rule_paths:
             msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
             raise DuplicateRuleError(msg)
 
         self.optimized = True
 
-    def debug_matches(self, matches, message, location=None, query_string=None,
-                      with_text=False, qry=None):
+    def debug_matches(
+        self,
+        matches,
+        message,
+        location=None,
+        query_string=None,
+        with_text=False,
+        qry=None,
+    ):
         """
         Log debug-level data for a list of `matches`.
         """
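
A self-contained sketch of the reshaped duplicate-rule check earlier in this hunk, using hypothetical rule identifier strings in place of Rule objects (the real code calls rule.identifier and raises DuplicateRuleError):

dupe_rules_by_hash = {
    'hash-a': ['gpl-2.0_1.RULE'],
    'hash-b': ['mit_12.RULE', 'mit_37.RULE'],
}

dupe_rule_paths = []
for rules in dupe_rules_by_hash.values():
    if len(rules) == 1:
        continue
    dupe_rule_paths.append('\n'.join(sorted(rules)))

if dupe_rule_paths:
    raise Exception('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))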