@@ -1425,13 +1425,33 @@ class BasicRule:
14251425 'Mutually exclusive from any is_license_* flag' )
14261426 )
14271427
# Flag for a rule whose own text is a required phrase; defaults to False and
# is left out of the repr. The help text below is the reference description.
is_required_phrase = attr.ib(
    default=False,
    repr=False,
    metadata=dict(
        help='True if this rule text is a required phrase. '
        'A required phrase is often a part of another larger rule text '
        'but is an essential section of the rule text which must be '
        'present in the case of partial matches, otherwise the match '
        'will be a false positive and misleading. ')
)
1438+
# Opt-out flag for required phrase rule creation; defaults to False and is
# left out of the repr. See the help text for the exact meaning.
skip_creating_required_phrases = attr.ib(
    default=False,
    repr=False,
    metadata={
        'help': (
            'True if this rule needs to be skipped while creating '
            'required phrase rules. Required phrase rules are created out '
            'of other rule texts which have marked required phrases. '
        ),
    },
)
1447+
# Language code of the rule text; defaults to 'en' and is left out of the repr.
language = attr.ib(
    default='en',
    repr=False,
    metadata={
        'help': (
            'Two-letter ISO 639-1 language code if this license text is '
            'not in English. See https://en.wikipedia.org/wiki/ISO_639-1 .'
        ),
    },
)
14351455
14361456 minimum_coverage = attr .ib (
14371457 default = 0 ,
@@ -1793,22 +1813,27 @@ def has_unknown(self):
17931813 # license flag instead
17941814 return self .license_expression and 'unknown' in self .license_expression
17951815
1796- def validate (self , licensing = None , thorough = False ):
1797- """
1798- Validate this rule using the provided ``licensing`` Licensing and yield
1799- one error message for each type of error detected.
1800- """
1801- is_false_positive = self .is_false_positive
1802-
1803- license_flags = (
@property
def license_flags(self):
    """
    Return a tuple of the values of this rule's flags, in this fixed order:
    is_license_notice, is_license_text, is_license_reference,
    is_license_tag, is_license_intro, is_license_clue and is_required_phrase.
    """
    flag_names = (
        'is_license_notice',
        'is_license_text',
        'is_license_reference',
        'is_license_tag',
        'is_license_intro',
        'is_license_clue',
        'is_required_phrase',
    )
    return tuple(getattr(self, flag_name) for flag_name in flag_names)
18111827
1828+ def validate (self , licensing = None , thorough = False ):
1829+ """
1830+ Validate this rule using the provided ``licensing`` Licensing and yield
1831+ one error message for each type of error detected.
1832+ """
1833+ is_false_positive = self .is_false_positive
1834+
1835+ license_flags = self .license_flags
1836+
18121837 has_license_flags = any (license_flags )
18131838 has_many_license_flags = len ([l for l in license_flags if l ]) != 1
18141839
@@ -1961,6 +1986,7 @@ def get_flags_mapping(self):
19611986 'is_license_tag' ,
19621987 'is_license_intro' ,
19631988 'is_license_clue' ,
1989+ 'is_required_phrase' ,
19641990 'is_continuous' ,
19651991 ]
19661992
@@ -1987,6 +2013,8 @@ def to_reference(self):
19872013 data ['is_license_tag' ] = self .is_license_tag
19882014 data ['is_license_intro' ] = self .is_license_intro
19892015 data ['is_license_clue' ] = self .is_license_clue
2016+ data ['is_required_phrase' ] = self .is_required_phrase
2017+ data ['skip_creating_required_phrases' ] = self .skip_creating_required_phrases
19902018 data ['is_continuous' ] = self .is_continuous
19912019 data ['is_builtin' ] = self .is_builtin
19922020 data ['is_from_license' ] = self .is_from_license
@@ -2019,13 +2047,15 @@ def to_dict(self, include_text=False):
20192047
20202048 flags = (
20212049 'is_false_positive' ,
2050+ 'is_required_phrase' ,
20222051 'is_license_text' ,
20232052 'is_license_notice' ,
20242053 'is_license_reference' ,
20252054 'is_license_tag' ,
20262055 'is_license_intro' ,
20272056 'is_license_clue' ,
20282057 'is_continuous' ,
2058+ 'skip_creating_required_phrases' ,
20292059 'is_deprecated'
20302060 )
20312061
@@ -2206,11 +2236,11 @@ def build_key_phrase_spans(self):
22062236 Return a list of Spans marking key phrases token positions of that must
22072237 be present for this rule to be matched.
22082238 """
2209- from licensedcode .required_phrases import get_key_phrase_spans
2239+ from licensedcode .required_phrases import get_key_phrase_spans_or_tokens
22102240 if self .is_from_license :
22112241 return []
22122242 try :
2213- return list (get_key_phrase_spans (self .text ))
2243+ return list (get_key_phrase_spans_or_tokens (self .text ))
22142244 except Exception as e :
22152245 raise InvalidRule (f'Invalid rule: { self } ' ) from e
22162246
@@ -2241,7 +2271,7 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
22412271
22422272 self .is_small = self .length < small_rule
22432273
2244- def dump (self , rules_data_dir ):
2274+ def dump (self , rules_data_dir , ** kwargs ):
22452275 """
22462276 Dump a representation of this rule as a .RULE file stored in
22472277 ``rules_data_dir`` as a UTF-8 file having:
@@ -2258,6 +2288,8 @@ def dump(self, rules_data_dir):
22582288 rule_file = self .rule_file (rules_data_dir = rules_data_dir )
22592289
22602290 metadata = self .to_dict ()
2291+ if kwargs :
2292+ metadata .update (kwargs )
22612293 content = self .text
22622294 output = dumps_frontmatter (content = content , metadata = metadata )
22632295 with open (rule_file , 'w' ) as of :
@@ -2300,6 +2332,8 @@ def load(self, rule_file, with_checks=True):
23002332 self .license_expression = data .get ('license_expression' )
23012333
23022334 self .is_false_positive = data .get ('is_false_positive' , False )
2335+ self .is_required_phrase = data .get ('is_required_phrase' , False )
2336+ self .skip_creating_required_phrases = data .get ('skip_creating_required_phrases' , False )
23032337
23042338 relevance = as_int (float (data .get ('relevance' ) or 0 ))
23052339 # Keep track of whether we have a stored relevance or not.
@@ -2359,7 +2393,7 @@ def set_relevance(self):
23592393 - relevance is computed based on the rule length
23602394 """
23612395
2362- if self .is_false_positive :
2396+ if self .is_false_positive or self . is_required_phrase :
23632397 self .relevance = 100
23642398 self .has_stored_relevance = True
23652399 return
@@ -2814,6 +2848,26 @@ def build_ignorables_mapping(copyrights, holders, authors, urls, emails):
28142848 return {k : v for k , v in sorted (ignorables .items ()) if v }
28152849
28162850
def rule_exists(text):
    """
    Return the identifier of the existing rule that ``text`` matches exactly,
    or False otherwise.

    ``text`` is matched against the index returned by ``cache.get_index()``.
    A rule identifier is returned only when there is exactly one match, made
    by the ``MATCH_HASH`` matcher with a score of 100. False is returned in
    every other case.
    """
    from licensedcode.match_hash import MATCH_HASH
    from licensedcode import cache

    idx = cache.get_index()
    matches = idx.match(query_string=text)

    # No match, or several matches, cannot be an exact match of a single rule.
    if not matches or len(matches) > 1:
        return False

    match = matches[0]
    if match.matcher == MATCH_HASH and match.score() == 100:
        return match.rule.identifier

    # A single match that is not an exact hash match: return False explicitly
    # as documented, rather than falling through with an implicit None.
    return False
2869+
2870+
28172871def find_rule_base_location (name_prefix , rules_directory = rules_data_dir ):
28182872 """
28192873 Return a new, unique and non-existing base location in ``rules_directory``
@@ -2842,10 +2896,9 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
28422896 idx += 1
28432897
28442898
2845- def get_rules_by_expression (rules_data_dir = rules_data_dir ):
2899+ def get_rules_by_identifier (rules_data_dir = rules_data_dir ):
28462900 """
2847- Get a dictionary (sorted by license_expression) of {license_expression: rules}
2848- where `rules` is a list of all rule objects having the `license_expression`.
2901+ Get a dictionary of {rule_identifier: rule} for all license rules.
28492902 """
28502903 rules = list (load_rules (rules_data_dir = rules_data_dir ))
28512904
@@ -2854,6 +2907,14 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
28542907 for rule in rules
28552908 }
28562909
2910+ return rules_by_identifier
2911+
2912+
2913+ def map_rules_by_expression (rules_by_identifier ):
2914+ """
2915+ Get a dictionary (sorted by license_expression) of {license_expression: rules}
2916+ from a dictionary of rules by their identifier.
2917+ """
28572918 rules_by_expression = defaultdict (list )
28582919
28592920 for rule in rules_by_identifier .values ():
@@ -2862,3 +2923,12 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
28622923 rules_by_expression [rule .license_expression ].append (rule )
28632924
28642925 return OrderedDict (sorted (rules_by_expression .items ()))
2926+
2927+
def get_rules_by_expression(rules_data_dir=rules_data_dir):
    """
    Return a mapping of {license_expression: rules} sorted by
    license_expression, where `rules` is the list of all rule objects that
    have this `license_expression`. Rules are loaded from ``rules_data_dir``.
    """
    return map_rules_by_expression(get_rules_by_identifier(rules_data_dir))
0 commit comments