Skip to content

Commit 9892ec2

Browse files
Add a new console script to add required phrases
* moves the script to licensedcode * adds a new console script `add-required-phrases` * moves the functions and tests to separate files Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 9e49d80 commit 9892ec2

File tree

6 files changed

+258
-225
lines changed

6 files changed

+258
-225
lines changed

setup-mini.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ console_scripts =
158158
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
159159
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
160160
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
161+
add-required-phrases = licensedcode.required_phrases:add_required_phrases
161162

162163
# These are configurations for ScanCode plugins as setuptools entry points.
163164
# Each plugin entry has this form:

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ console_scripts =
158158
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
159159
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
160160
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
161+
add-required-phrases = licensedcode.required_phrases:add_required_phrases
161162

162163
# These are configurations for ScanCode plugins as setuptools entry points.
163164
# Each plugin entry has this form:

src/licensedcode/models.py

Lines changed: 1 addition & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import click
1110
import os
1211
import sys
1312
import traceback
@@ -36,20 +35,11 @@
3635
from licensedcode import MIN_MATCH_HIGH_LENGTH
3736
from licensedcode import MIN_MATCH_LENGTH
3837
from licensedcode import SMALL_RULE
39-
from licensedcode import TINY_RULE
4038
from licensedcode.frontmatter import dumps_frontmatter
4139
from licensedcode.frontmatter import load_frontmatter
4240
from licensedcode.languages import LANG_INFO as known_languages
43-
from licensedcode.spans import Span
4441
from licensedcode.tokenize import index_tokenizer
4542
from licensedcode.tokenize import index_tokenizer_with_stopwords
46-
from licensedcode.tokenize import key_phrase_tokenizer
47-
from licensedcode.tokenize import return_spans_for_key_phrase_in_text
48-
from licensedcode.tokenize import get_ignorable_spans
49-
from licensedcode.tokenize import get_non_overlapping_spans
50-
from licensedcode.tokenize import add_key_phrase_markers
51-
from licensedcode.tokenize import KEY_PHRASE_OPEN
52-
from licensedcode.tokenize import KEY_PHRASE_CLOSE
5343
from licensedcode.tokenize import query_lines
5444
from scancode.api import SCANCODE_LICENSEDB_URL
5545
from scancode.api import SCANCODE_LICENSE_URL
@@ -2215,6 +2205,7 @@ def build_key_phrase_spans(self):
22152205
Return a list of Spans marking key phrases token positions of that must
22162206
be present for this rule to be matched.
22172207
"""
2208+
from licensedcode.required_phrases import get_key_phrase_spans
22182209
if self.is_from_license:
22192210
return []
22202211
try:
@@ -2870,140 +2861,3 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
28702861
rules_by_expression[rule.license_expression].append(rule)
28712862

28722863
return OrderedDict(sorted(rules_by_expression.items()))
2873-
2874-
2875-
def get_key_phrase_spans(text):
    """
    Yield Spans of key phrase token positions found in the rule ``text``.
    Tokens form a key phrase when enclosed in {{double curly braces}}.

    Positions count every token returned by the key phrase tokenizer except
    the opening and closing brace markers themselves.

    Raise an InvalidRule when key phrase braces are nested, are not balanced
    or do not enclose any token.

    For example:

    >>> text = 'This is enclosed in {{double curly braces}}'
    >>> #       0    1  2        3    4      5     6
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(4, 6)], x

    >>> text = 'This is {{enclosed}} a {{double curly braces}} or not'
    >>> #       0    1    2          SW  3      4     5        6  7
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(2), Span(3, 5)], x

    >>> text = 'This {{is}} enclosed a {{double curly braces}} or not'
    >>> #       0      1    2        SW  3      4     5        6  7
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span([1]), Span([3, 4, 5])], x

    >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}'
    >>> #         0    1 2 3   4      5       6      7       8  9
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(0, 9)], x

    >>> assert list(get_key_phrase_spans('{This}')) == []

    >>> def check_exception(text):
    ...     try:
    ...         return list(get_key_phrase_spans(text))
    ...     except InvalidRule:
    ...         pass

    >>> check_exception('This {{is')
    >>> check_exception('This }}is')
    >>> check_exception('{{This }}is{{')
    >>> check_exception('This }}is{{')
    >>> check_exception('{{}}')
    >>> check_exception('{{This is')
    >>> check_exception('{{This is{{')
    >>> check_exception('{{This is{{ }}')
    >>> check_exception('{{{{This}}}}')
    >>> check_exception('}}This {{is}}')
    >>> check_exception('This }} {{is}}')
    >>> check_exception('{{This}}')
    [Span(0)]
    >>> check_exception('{This}')
    []
    >>> check_exception('{{{This}}}')
    [Span(0)]
    """
    # position of the next regular (non-brace) token
    ipos = 0
    in_key_phrase = False
    # token positions collected for the key phrase currently being read
    key_phrase = []

    for token in key_phrase_tokenizer(text):
        if token == KEY_PHRASE_OPEN:
            if in_key_phrase:
                raise InvalidRule('Invalid rule with nested key phrase {{ {{ braces', text)
            in_key_phrase = True

        elif token == KEY_PHRASE_CLOSE:
            if not in_key_phrase:
                # closing braces seen without any matching opening braces
                raise InvalidRule('Invalid rule with dangling key phrase missing opening braces', text)
            if not key_phrase:
                raise InvalidRule('Invalid rule with empty key phrase {{}} braces', text)

            yield Span(key_phrase)
            # start a fresh list rather than clearing the one just handed to
            # Span so that an already yielded Span can never be affected
            key_phrase = []
            in_key_phrase = False

        else:
            # brace markers do not consume a position: only regular tokens do
            if in_key_phrase:
                key_phrase.append(ipos)
            ipos += 1

    if key_phrase or in_key_phrase:
        raise InvalidRule('Invalid rule with dangling key phrase missing final closing braces', text)
2955-
2956-
2957-
def add_key_phrases_for_license_fields(licence_object, rules):
    """
    Add key phrase markers around occurrences of the ``licence_object`` name
    and short_name found in the text of each rule of the ``rules`` iterable,
    and save on disk every rule whose text was updated this way.

    Rules with a text length smaller than TINY_RULE are skipped. Occurrences
    overlapping an existing key phrase or an ignorable span are left alone.
    """
    # The license fields to mark as key phrases, in processing order.
    # The license "key" and "spdx_license_key" are not used for now.
    field_values = (
        licence_object.name,
        licence_object.short_name,
    )

    for rule in rules:
        # skip small rules
        if len(rule.text) < TINY_RULE:
            continue

        rule_location = os.path.join(rules_data_dir, rule.identifier)

        for field_value in field_values:
            # Always reload from disk: processing a previous license field
            # may have updated and saved this rule already
            fresh_rule = Rule.from_file(rule_location)

            # candidate spans where the field value occurs in the rule text
            candidate_spans = return_spans_for_key_phrase_in_text(
                text=fresh_rule.text,
                key_phrase=field_value,
            )

            # spans already taken by ignorables and by existing key phrases
            ignorables = get_ignorable_spans(fresh_rule)
            existing_spans = fresh_rule.build_key_phrase_spans()

            # keep only the candidates that do not overlap a taken span
            usable_spans = get_non_overlapping_spans(
                old_key_phrase_spans=existing_spans + ignorables,
                new_key_phrase_spans=candidate_spans,
            )

            # wrap each usable span in key phrase markers, one at a time
            marked_text = fresh_rule.text
            for usable_span in list(usable_spans):
                marked_text = add_key_phrase_markers(
                    text=marked_text,
                    key_phrase_span=usable_span,
                )

            # nothing was marked: there is nothing to save for this field
            if marked_text == fresh_rule.text:
                continue

            click.echo(f"Updating rule: {fresh_rule.identifier}")
            fresh_rule.text = marked_text
            fresh_rule.dump(rules_data_dir)

0 commit comments

Comments
 (0)