|
7 | 7 | # See https://aboutcode.org for more information about nexB OSS projects. |
8 | 8 | # |
9 | 9 |
|
10 | | -import click |
11 | 10 | import os |
12 | 11 | import sys |
13 | 12 | import traceback |
|
36 | 35 | from licensedcode import MIN_MATCH_HIGH_LENGTH |
37 | 36 | from licensedcode import MIN_MATCH_LENGTH |
38 | 37 | from licensedcode import SMALL_RULE |
39 | | -from licensedcode import TINY_RULE |
40 | 38 | from licensedcode.frontmatter import dumps_frontmatter |
41 | 39 | from licensedcode.frontmatter import load_frontmatter |
42 | 40 | from licensedcode.languages import LANG_INFO as known_languages |
43 | | -from licensedcode.spans import Span |
44 | 41 | from licensedcode.tokenize import index_tokenizer |
45 | 42 | from licensedcode.tokenize import index_tokenizer_with_stopwords |
46 | | -from licensedcode.tokenize import key_phrase_tokenizer |
47 | | -from licensedcode.tokenize import return_spans_for_key_phrase_in_text |
48 | | -from licensedcode.tokenize import get_ignorable_spans |
49 | | -from licensedcode.tokenize import get_non_overlapping_spans |
50 | | -from licensedcode.tokenize import add_key_phrase_markers |
51 | | -from licensedcode.tokenize import KEY_PHRASE_OPEN |
52 | | -from licensedcode.tokenize import KEY_PHRASE_CLOSE |
53 | 43 | from licensedcode.tokenize import query_lines |
54 | 44 | from scancode.api import SCANCODE_LICENSEDB_URL |
55 | 45 | from scancode.api import SCANCODE_LICENSE_URL |
@@ -2215,6 +2205,7 @@ def build_key_phrase_spans(self): |
2215 | 2205 | Return a list of Spans marking key phrases token positions of that must |
2216 | 2206 | be present for this rule to be matched. |
2217 | 2207 | """ |
| 2208 | + from licensedcode.required_phrases import get_key_phrase_spans |
2218 | 2209 | if self.is_from_license: |
2219 | 2210 | return [] |
2220 | 2211 | try: |
@@ -2870,140 +2861,3 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir): |
2870 | 2861 | rules_by_expression[rule.license_expression].append(rule) |
2871 | 2862 |
|
2872 | 2863 | return OrderedDict(sorted(rules_by_expression.items())) |
2873 | | - |
2874 | | - |
def get_key_phrase_spans(text):
    """
    Yield Spans of key phrase token positions found in the rule ``text``.
    Tokens form a key phrase when enclosed in {{double curly braces}}.

    Positions are counted over the tokens returned by
    ``key_phrase_tokenizer``, not counting the brace markers themselves.

    Raise an InvalidRule exception when the braces are nested, empty, or
    not balanced.

    For example:

    >>> text = 'This is enclosed in {{double curly braces}}'
    >>> #       0    1  2        3    4      5     6
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(4, 6)], x

    >>> text = 'This is {{enclosed}} a {{double curly braces}} or not'
    >>> #       0    1    2          SW  3      4     5        6  7
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(2), Span(3, 5)], x

    >>> text = 'This {{is}} enclosed a {{double curly braces}} or not'
    >>> #       0      1    2        SW  3      4     5        6  7
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span([1]), Span([3, 4, 5])], x

    >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}'
    >>> #         0    1 2 3   4      5       6      7       8  9
    >>> x = list(get_key_phrase_spans(text))
    >>> assert x == [Span(0, 9)], x

    >>> assert list(get_key_phrase_spans('{This}')) == []

    >>> def check_exception(text):
    ...     try:
    ...         return list(get_key_phrase_spans(text))
    ...     except InvalidRule:
    ...         pass

    >>> check_exception('This {{is')
    >>> check_exception('This }}is')
    >>> check_exception('{{This }}is{{')
    >>> check_exception('This }}is{{')
    >>> check_exception('{{}}')
    >>> check_exception('{{This is')
    >>> check_exception('{{This is{{')
    >>> check_exception('{{This is{{ }}')
    >>> check_exception('{{{{This}}}}')
    >>> check_exception('}}This {{is}}')
    >>> check_exception('This }} {{is}}')
    >>> check_exception('{{This}}')
    [Span(0)]
    >>> check_exception('{This}')
    []
    >>> check_exception('{{{This}}}')
    [Span(0)]
    """
    ipos = 0
    in_key_phrase = False
    key_phrase = []
    for token in key_phrase_tokenizer(text):
        if token == KEY_PHRASE_OPEN:
            if in_key_phrase:
                raise InvalidRule('Invalid rule with nested key phrase {{ {{ braces', text)
            in_key_phrase = True

        elif token == KEY_PHRASE_CLOSE:
            if not in_key_phrase:
                # a closing marker that was never opened: the opening braces
                # are the ones that are missing
                raise InvalidRule('Invalid rule with dangling key phrase missing opening braces', text)
            if not key_phrase:
                raise InvalidRule('Invalid rule with empty key phrase {{}} braces', text)

            yield Span(key_phrase)
            # start a fresh list rather than clearing in place, so that the
            # positions just handed to Span can never be mutated afterwards
            key_phrase = []
            in_key_phrase = False

        else:
            # a regular token: only these advance the token position
            if in_key_phrase:
                key_phrase.append(ipos)
            ipos += 1

    # positions are only collected while a phrase is open, so an open phrase
    # at the end of the text is the only remaining failure mode
    if in_key_phrase:
        raise InvalidRule('Invalid rule with dangling key phrase missing final closing braces', text)
2955 | | - |
2956 | | - |
def add_key_phrases_for_license_fields(licence_object, rules):
    """
    Mark occurrences of the ``licence_object`` name and short_name as key
    phrases in the text of each rule of the ``rules`` iterable, and save each
    updated rule back to ``rules_data_dir``.

    Occurrences that overlap with a key phrase already present in a rule or
    with one of its ignorable spans are left untouched. Rules whose text is
    shorter than TINY_RULE are skipped. A message is printed for each rule
    that is updated.
    """
    # The order matters: the name is processed before the short_name.
    # Missing or empty values are skipped as there is nothing to search for.
    license_field_values = [
        value
        for value in (licence_object.name, licence_object.short_name)
        if value
    ]

    for rule in rules:
        # skip small rules
        # NOTE(review): this compares a length in characters with TINY_RULE;
        # confirm that TINY_RULE is not meant to be a count of tokens.
        if len(rule.text) < TINY_RULE:
            continue

        rule_file = os.path.join(rules_data_dir, rule.identifier)

        for license_field_value in license_field_values:

            # Reload from file as there could be changes from other license fields
            reloaded_rule = Rule.from_file(rule_file)

            # we get spans for name/short_name if they exist
            new_key_phrase_spans = return_spans_for_key_phrase_in_text(
                text=reloaded_rule.text,
                key_phrase=license_field_value,
            )

            # we get spans for already existing key phrases and ignorables
            ignorable_spans = get_ignorable_spans(reloaded_rule)
            old_key_phrase_spans = reloaded_rule.build_key_phrase_spans()

            # we verify whether there are spans which overlap with the
            # already present key phrases or ignorables
            spans_to_add = list(
                get_non_overlapping_spans(
                    old_key_phrase_spans=old_key_phrase_spans + ignorable_spans,
                    new_key_phrase_spans=new_key_phrase_spans,
                )
            )

            text_rule = reloaded_rule.text

            # we add key phrase markers for the non-overlapping spans
            for span_to_add in spans_to_add:
                text_rule = add_key_phrase_markers(
                    text=text_rule,
                    key_phrase_span=span_to_add,
                )

            # write the rule on disk if there are any updates
            if text_rule != reloaded_rule.text:
                click.echo(f"Updating rule: {reloaded_rule.identifier}")
                reloaded_rule.text = text_rule
                reloaded_rule.dump(rules_data_dir)
0 commit comments