Skip to content

Commit 66f2be5

Browse files
Add required phrases from other rules
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 966adde commit 66f2be5

File tree

7 files changed

+417
-88
lines changed

7 files changed

+417
-88
lines changed

etc/scripts/licenses/buildrules.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from licensedcode import models
1717
from licensedcode import match_hash
1818
from licensedcode import frontmatter
19+
from licensedcode.models import rule_exists
1920
from license_expression import Licensing
2021

2122
"""
@@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
129130
return rules
130131

131132

132-
def rule_exists(text):
133-
"""
134-
Return the matched rule identifier if the text is an existing rule matched
135-
exactly, False otherwise.
136-
"""
137-
idx = cache.get_index()
138-
139-
matches = idx.match(query_string=text)
140-
if not matches:
141-
return False
142-
if len(matches) > 1:
143-
return False
144-
match = matches[0]
145-
if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
146-
return match.rule.identifier
147-
148-
149133
def all_rule_by_tokens():
150134
"""
151135
Return a mapping of {tuples of tokens: rule id}, with one item for each

etc/scripts/licenses/report_license_rules.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
"is_license_reference",
6363
"is_license_intro",
6464
"is_license_clue",
65+
"is_required_phrase",
66+
"skip_creating_required_phrases",
6567
"is_deprecated",
6668
"has_unknown",
6769
"only_known_words",

src/licensedcode/models.py

Lines changed: 86 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,13 +1425,33 @@ class BasicRule:
14251425
'Mutually exclusive from any is_license_* flag')
14261426
)
14271427

1428+
is_required_phrase = attr.ib(
1429+
default=False,
1430+
repr=False,
1431+
metadata=dict(
1432+
help='True if this is rule text is a required phrase '
1433+
'A required phrase is often a part of another larger rule text '
1434+
'but is an essential section of the rule text which must be '
1435+
'present in the case of partial matches, otherwise the match '
1436+
'will be a false positive and misleading. ')
1437+
)
1438+
1439+
skip_creating_required_phrases = attr.ib(
1440+
default=False,
1441+
repr=False,
1442+
metadata=dict(
1443+
help='True if this rule needs to be skipped while creating '
1444+
'required phrase rules. Required phrase rules are created out '
1445+
'of other rule texts which have marked required phrases. ')
1446+
)
1447+
14281448
language = attr.ib(
14291449
default='en',
14301450
repr=False,
14311451
metadata=dict(
14321452
help='Two-letter ISO 639-1 language code if this license text is '
14331453
'not in English. See https://en.wikipedia.org/wiki/ISO_639-1 .')
1434-
)
1454+
)
14351455

14361456
minimum_coverage = attr.ib(
14371457
default=0,
@@ -1793,22 +1813,27 @@ def has_unknown(self):
17931813
# license flag instead
17941814
return self.license_expression and 'unknown' in self.license_expression
17951815

1796-
def validate(self, licensing=None, thorough=False):
1797-
"""
1798-
Validate this rule using the provided ``licensing`` Licensing and yield
1799-
one error message for each type of error detected.
1800-
"""
1801-
is_false_positive = self.is_false_positive
1802-
1803-
license_flags = (
1816+
@property
1817+
def license_flags(self):
1818+
return (
18041819
self.is_license_notice,
18051820
self.is_license_text,
18061821
self.is_license_reference,
18071822
self.is_license_tag,
18081823
self.is_license_intro,
18091824
self.is_license_clue,
1825+
self.is_required_phrase,
18101826
)
18111827

1828+
def validate(self, licensing=None, thorough=False):
1829+
"""
1830+
Validate this rule using the provided ``licensing`` Licensing and yield
1831+
one error message for each type of error detected.
1832+
"""
1833+
is_false_positive = self.is_false_positive
1834+
1835+
license_flags = self.license_flags
1836+
18121837
has_license_flags = any(license_flags)
18131838
has_many_license_flags = len([l for l in license_flags if l]) != 1
18141839

@@ -1961,6 +1986,7 @@ def get_flags_mapping(self):
19611986
'is_license_tag',
19621987
'is_license_intro',
19631988
'is_license_clue',
1989+
'is_required_phrase',
19641990
'is_continuous',
19651991
]
19661992

@@ -1987,6 +2013,8 @@ def to_reference(self):
19872013
data['is_license_tag'] = self.is_license_tag
19882014
data['is_license_intro'] = self.is_license_intro
19892015
data['is_license_clue'] = self.is_license_clue
2016+
data['is_required_phrase'] = self.is_required_phrase
2017+
data['skip_creating_required_phrases'] = self.skip_creating_required_phrases
19902018
data['is_continuous'] = self.is_continuous
19912019
data['is_builtin'] = self.is_builtin
19922020
data['is_from_license'] = self.is_from_license
@@ -2019,13 +2047,15 @@ def to_dict(self, include_text=False):
20192047

20202048
flags = (
20212049
'is_false_positive',
2050+
'is_required_phrase',
20222051
'is_license_text',
20232052
'is_license_notice',
20242053
'is_license_reference',
20252054
'is_license_tag',
20262055
'is_license_intro',
20272056
'is_license_clue',
20282057
'is_continuous',
2058+
'skip_creating_required_phrases',
20292059
'is_deprecated'
20302060
)
20312061

@@ -2206,11 +2236,11 @@ def build_key_phrase_spans(self):
22062236
Return a list of Spans marking the token positions of key phrases that must
22072237
be present for this rule to be matched.
22082238
"""
2209-
from licensedcode.required_phrases import get_key_phrase_spans
2239+
from licensedcode.required_phrases import get_key_phrase_spans_or_tokens
22102240
if self.is_from_license:
22112241
return []
22122242
try:
2213-
return list(get_key_phrase_spans(self.text))
2243+
return list(get_key_phrase_spans_or_tokens(self.text))
22142244
except Exception as e:
22152245
raise InvalidRule(f'Invalid rule: {self}') from e
22162246

@@ -2241,7 +2271,7 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
22412271

22422272
self.is_small = self.length < small_rule
22432273

2244-
def dump(self, rules_data_dir):
2274+
def dump(self, rules_data_dir, **kwargs):
22452275
"""
22462276
Dump a representation of this rule as a .RULE file stored in
22472277
``rules_data_dir`` as a UTF-8 file having:
@@ -2258,6 +2288,8 @@ def dump(self, rules_data_dir):
22582288
rule_file = self.rule_file(rules_data_dir=rules_data_dir)
22592289

22602290
metadata = self.to_dict()
2291+
if kwargs:
2292+
metadata.update(kwargs)
22612293
content = self.text
22622294
output = dumps_frontmatter(content=content, metadata=metadata)
22632295
with open(rule_file, 'w') as of:
@@ -2300,6 +2332,8 @@ def load(self, rule_file, with_checks=True):
23002332
self.license_expression = data.get('license_expression')
23012333

23022334
self.is_false_positive = data.get('is_false_positive', False)
2335+
self.is_required_phrase = data.get('is_required_phrase', False)
2336+
self.skip_creating_required_phrases = data.get('skip_creating_required_phrases', False)
23032337

23042338
relevance = as_int(float(data.get('relevance') or 0))
23052339
# Keep track of whether we have a stored relevance or not.
@@ -2359,7 +2393,7 @@ def set_relevance(self):
23592393
- relevance is computed based on the rule length
23602394
"""
23612395

2362-
if self.is_false_positive:
2396+
if self.is_false_positive or self.is_required_phrase:
23632397
self.relevance = 100
23642398
self.has_stored_relevance = True
23652399
return
@@ -2814,6 +2848,26 @@ def build_ignorables_mapping(copyrights, holders, authors, urls, emails):
28142848
return {k: v for k, v in sorted(ignorables.items()) if v}
28152849

28162850

2851+
def rule_exists(text):
2852+
"""
2853+
Return the matched rule identifier if the text is an existing rule matched
2854+
exactly, False otherwise.
2855+
"""
2856+
from licensedcode.match_hash import MATCH_HASH
2857+
from licensedcode import cache
2858+
2859+
idx = cache.get_index()
2860+
2861+
matches = idx.match(query_string=text)
2862+
if not matches:
2863+
return False
2864+
if len(matches) > 1:
2865+
return False
2866+
match = matches[0]
2867+
if match.matcher == MATCH_HASH and match.score() == 100:
2868+
return match.rule.identifier
2869+
2870+
28172871
def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
28182872
"""
28192873
Return a new, unique and non-existing base location in ``rules_directory``
@@ -2842,10 +2896,9 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
28422896
idx += 1
28432897

28442898

2845-
def get_rules_by_expression(rules_data_dir=rules_data_dir):
2899+
def get_rules_by_identifier(rules_data_dir=rules_data_dir):
28462900
"""
2847-
Get a dictionary (sorted by license_expression) of {license_expression: rules}
2848-
where `rules` is a list of all rule objects having the `license_expression`.
2901+
Get a dictionary of {rule_identifier: rule} for all license rules.
28492902
"""
28502903
rules = list(load_rules(rules_data_dir=rules_data_dir))
28512904

@@ -2854,6 +2907,14 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
28542907
for rule in rules
28552908
}
28562909

2910+
return rules_by_identifier
2911+
2912+
2913+
def map_rules_by_expression(rules_by_identifier):
2914+
"""
2915+
Get a dictionary (sorted by license_expression) of {license_expression: rules}
2916+
from a dictionary of rules by their identifier.
2917+
"""
28572918
rules_by_expression = defaultdict(list)
28582919

28592920
for rule in rules_by_identifier.values():
@@ -2862,3 +2923,12 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
28622923
rules_by_expression[rule.license_expression].append(rule)
28632924

28642925
return OrderedDict(sorted(rules_by_expression.items()))
2926+
2927+
2928+
def get_rules_by_expression(rules_data_dir=rules_data_dir):
2929+
"""
2930+
Get a dictionary (sorted by license_expression) of {license_expression: rules}
2931+
where `rules` is a list of all rule objects having the `license_expression`.
2932+
"""
2933+
rules_by_identifier = get_rules_by_identifier(rules_data_dir)
2934+
return map_rules_by_expression(rules_by_identifier)

0 commit comments

Comments
 (0)