
Commit b956432

Add new phrase for `extra-words` in rules

Add a new `extra_phrase` marker that is specific to extra-words. The marker is written as [[n]], where n is the maximum number of extra words allowed at that position in the rule. If extra words appear at the marked position and their count does not exceed the allowed limit n, the match score is raised to 100.

Signed-off-by: Alok Kumar <[email protected]>
1 parent 3e5c913 commit b956432
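
As a minimal illustration of the marker semantics (the rule text, the extra words, and the snippet below are made up for this example):

    import re

    # Hypothetical rule text using the new [[n]] marker: up to 6 extra words
    # may appear at this position in the matched text.
    rule_text = "Neither the name of [[6]] nor the names of its"

    # A matched text with 3 extra words ("the University of") at the marked
    # position still counts as a full match and scores 100 under this change.
    matched_text = "Neither the name of the University of nor the names of its"

    # The marker is stripped from the rule text before indexing, as the
    # remove_extra_phrase() helper added in this commit does:
    print(re.sub(r'\[\[\d+\]\]', '', rule_text))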

10 files changed: +389 -9 lines changed

src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 
-Neither the name of nor the names of its
+Neither the name of [[6]] nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

src/licensedcode/data/rules/bsd-new_578.RULE

Lines changed: 4 additions & 2 deletions

@@ -6,7 +6,9 @@ minimum_coverage: 99
 
 Software License Agreement (BSD License)
 
-Redistribution and use in source and binary forms, with or without
+[[15]]
+
+Redistribution and use [[4]] in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
@@ -16,7 +18,7 @@ are met:
 copyright notice, this list of conditions and the following
 disclaimer in the documentation and/or other materials provided
 with the distribution.
-* Neither the name of nor the names of its
+* Neither the name of [[6]] nor the names of its
 contributors may be used to endorse or promote products derived
 from this software without specific prior written permission.

src/licensedcode/detection.py

Lines changed: 24 additions & 0 deletions

@@ -30,6 +30,7 @@
 from licensedcode.cache import get_licensing
 from licensedcode.match import LicenseMatch
 from licensedcode.match import set_matched_lines
+from licensedcode.match import is_extra_words_position_valid
 from licensedcode.models import compute_relevance
 from licensedcode.models import Rule
 from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
     PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
     PACKAGE_ADD_FROM_FILE = 'from-package-file'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     UNKNOWN_MATCH = 'unknown-match'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
     """
     UNKNOWN_MATCH = 'unknown-match'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches):
         is_correct_detection(license_matches)
         and not has_unknown_matches(license_matches)
         and not has_extra_words(license_matches)
+        and not is_extra_words_at_valid_positions(license_matches)
     )
 
 
@@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches):
     )
 
 
+def is_extra_words_at_valid_positions(license_matches):
+    """
+    Return True if any of the matches in the ``license_matches`` list of
+    LicenseMatch has extra words at valid positions.
+    """
+    return any(
+        is_extra_words_position_valid(license_match)
+        for license_match in license_matches
+    )
+
 def is_false_positive(license_matches, package_license=False):
     """
     Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1584,12 @@ def get_detected_license_expression(
             detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
         return detection_log, combined_expression
 
+    elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
+        if TRACE_ANALYSIS:
+            logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
+        matches_for_expression = license_matches
+        detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)
+
    elif analysis == DetectionCategory.EXTRA_WORDS.value:
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False):
         threshold=IMPERFECT_MATCH_COVERAGE_THR,
     ):
         return DetectionCategory.IMPERFECT_COVERAGE.value
+
+    # Case where `extra-words` are at valid positions
+    elif is_extra_words_at_valid_positions(license_matches=license_matches):
+        return DetectionCategory.EXTRA_WORDS_PERMITTED.value
 
     # Case where at least one of the matches has extra words
     elif has_extra_words(license_matches=license_matches):
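
For clarity, a simplified, self-contained sketch of the routing order added to analyze_detection(); the predicates and the fallback value are hypothetical stand-ins for the real functions:

    from typing import Callable, List

    def classify(matches: List[object],
                 extra_words_valid: Callable[[List[object]], bool],
                 has_extra_words: Callable[[List[object]], bool]) -> str:
        # The permitted check runs first, so matches whose extra words all sit
        # at [[n]]-marked positions are not downgraded to plain 'extra-words'.
        if extra_words_valid(matches):
            return 'extra-words-permitted-in-rule'
        if has_extra_words(matches):
            return 'extra-words'
        return 'other-analysis'  # stand-in for the remaining cases

    print(classify([], lambda m: True, lambda m: True))   # extra-words-permitted-in-rule
    print(classify([], lambda m: False, lambda m: True))  # extra-words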

src/licensedcode/match.py

Lines changed: 57 additions & 0 deletions

@@ -598,6 +598,12 @@ def score(self):
         in the matched range (including unknowns and unmatched) and the matched
         rule relevance.
         """
+
+        # Check whether extra words in the matched text appear in allowed positions
+        # and do not exceed the maximum allowed word count at those positions.
+        if is_extra_words_position_valid(match=self):
+            return 100
+
         # relevance is a number between 0 and 100. Divide by 100
         relevance = self.rule.relevance / 100
         if not relevance:
@@ -1071,6 +1077,57 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
     # early from the loops: trying to check containment on wildly separated matches
     # does not make sense
 
+def is_extra_words_position_valid(match):
+    """
+    Return True if the extra words appear in valid positions and do not exceed
+    the maximum allowed word count at those positions. Otherwise, return False.
+    """
+    rule_spans = match.ispan.subspans()
+
+    # If there are multiple subspans, not all required tokens are contiguous.
+    if len(rule_spans) > 1:
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    extra_phrase_spans = match.rule.extra_phrase_spans
+
+    if not extra_phrase_spans:
+        return False
+
+    # Count of `extra-words` tokens inserted in `matched_tokens`
+    matched_count = 0
+
+    # Count of extra phrase markers seen so far
+    extra_phrase_count = 0
+
+    for span, allowed_extra_words in extra_phrase_spans:
+        rule_index = span.start - extra_phrase_count - 1
+        matched_index = span.start + matched_count - extra_phrase_count
+        extra_words_count = 0
+
+        # Return False if the token before the extra words in `matched_tokens`
+        # differs from the token before the extra phrase marker in `rule_tokens`.
+        if matched_tokens[matched_index - 1] != rule_tokens[rule_index]:
+            return False
+
+        # Count how many tokens in `matched_tokens` do not match the next rule token.
+        while (matched_index < len(matched_tokens) and
+               matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
+            matched_index += 1
+            matched_count += 1
+            extra_words_count += 1
+
+        if extra_words_count > allowed_extra_words:
+            return False
+
+        extra_phrase_count += 1
+
+    return True
 
 
 def filter_contained_matches(
     matches,
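
The index arithmetic above is easier to follow on plain token lists. Below is a minimal standalone sketch of the same counting logic; extra_words_ok and its inputs are illustrative names, not part of this commit:

    def extra_words_ok(matched_tokens, rule_tokens, extra_spans):
        """
        ``extra_spans`` lists (marker_pos, allowed) pairs, where marker_pos is
        the position of the [[n]] digit token in the marker-aware rule token
        stream and allowed is n. ``rule_tokens`` has the markers stripped.
        """
        matched_shift = 0  # extra words consumed so far in matched_tokens
        markers_seen = 0   # [[n]] markers consumed so far
        for marker_pos, allowed in extra_spans:
            rule_index = marker_pos - markers_seen - 1  # token before the marker
            matched_index = marker_pos + matched_shift - markers_seen
            # The anchor token just before the marker must match in both streams.
            if matched_tokens[matched_index - 1] != rule_tokens[rule_index]:
                return False
            extra = 0
            # Count inserted tokens until the next rule token reappears.
            while (matched_index < len(matched_tokens)
                   and matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
                matched_index += 1
                matched_shift += 1
                extra += 1
            if extra > allowed:
                return False
            markers_seen += 1
        return True

    rule = "neither the name of nor the names of its".split()
    matched = "neither the name of the university of nor the names of its".split()
    print(extra_words_ok(matched, rule, [(4, 6)]))  # True: 3 extra words, 6 allowed
    print(extra_words_ok(matched, rule, [(4, 2)]))  # False: 3 extra words exceed 2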

src/licensedcode/models.py

Lines changed: 41 additions & 0 deletions

@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+import re
 import os
 import sys
 import traceback
@@ -43,6 +44,7 @@
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import query_lines
+from licensedcode.tokenize import get_extra_phrase_spans
 from scancode.api import SCANCODE_LICENSEDB_URL
 from scancode.api import SCANCODE_LICENSE_URL
 from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1685,17 @@ class BasicRule:
         )
     )
 
+    extra_phrase_spans = attr.ib(
+        default=attr.Factory(list),
+        repr=False,
+        metadata=dict(
+            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+                 'Each tuple contains a Span of token positions in the rule text and an integer '
+                 'indicating the maximum number of extra tokens allowed at that position. '
+                 'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
+        )
+    )
+
     source = attr.ib(
         default=None,
         repr=False,
@@ -2306,6 +2319,9 @@ def load_data(self, rule_file):
         except Exception:
             trace = traceback.format_exc()
             raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')
+
+        # Remove extra phrase markers from the rule text
+        self.text = remove_extra_phrase(self.text)
 
         return self
 
@@ -2317,8 +2333,15 @@ def tokens(self):
         "is_continuous", "minimum_coverage" and "stopword_by_pos" are
         recomputed as a side effect.
         """
+
+        # Identify and capture the spans of extra phrases specified within the rule
+        self.extra_phrase_spans = list(self.extra_phrases())
+
+        # Remove extra phrase markers from the rule text
+        self.text = remove_extra_phrase(self.text)
 
         text = self.text
+
         # We tag this rule as being a bare URL if it starts with a scheme and is
         # on one line: this is used to determine a matching approach
@@ -2353,6 +2376,17 @@ def _set_continuous(self):
         ):
             self.is_continuous = True
 
+    def extra_phrases(self):
+        """
+        Return an iterable of `(Span, int)` tuples marking the positions of
+        extra phrases in the rule text.
+
+        Each tuple consists of:
+        - a `Span` object representing the position in the tokenized rule text, and
+        - an integer `n` indicating how many extra tokens are allowed at that position.
+        """
+        if self.text:
+            yield from get_extra_phrase_spans(self.text)
+
     def build_required_phrase_spans(self):
         """
         Return a list of Spans marking required phrases token positions that must
@@ -2570,6 +2604,13 @@ def from_match_data(license_match_mapping):
         return get_index().rules_by_id[rule_identifier]
 
 
+def remove_extra_phrase(text):
+    """
+    Remove extra phrase markers like [[n]], where n is a digit.
+    """
+    pattern = r'\[\[\d+\]\]'
+    return re.sub(pattern, '', text)
+
 def compute_relevance(length):
     """
     Return a computed ``relevance`` given a ``length`` and a threshold.
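
A quick sketch of the stripping behavior; the doubled whitespace left behind is harmless since the tokenizers ignore whitespace:

    import re

    def remove_extra_phrase(text):
        pattern = r'\[\[\d+\]\]'
        return re.sub(pattern, '', text)

    print(remove_extra_phrase("Neither the name of [[6]] nor the names of its"))
    # -> "Neither the name of  nor the names of its" (note the doubled space)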

src/licensedcode/tokenize.py

Lines changed: 66 additions & 0 deletions

@@ -81,12 +81,78 @@ def query_lines(
 required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall
 
+
+extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'
 
+EXTRA_PHRASE_OPEN = '[['
+EXTRA_PHRASE_CLOSE = ']]'
+
 # FIXME: this should be folded in a single pass tokenization with the index_tokenizer
 
 
+def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
+    """
+    Yield tokens from a rule ``text``, including extra phrase [[n]] markers,
+    where n denotes the maximum number of extra words valid at that position.
+    This works the same way as ``required_phrase_tokenizer``.
+    """
+    if not text:
+        return
+    if not preserve_case:
+        text = text.lower()
+
+    for token in extra_phrase_splitter(text):
+        if token and token not in stopwords:
+            yield token
+
+
+def get_extra_phrase_spans(text):
+    """
+    Return a list of `(Span, int)` tuples, one for each [[n]] extra phrase
+    found in ``text``, where `n` must be a digit token inside the extra
+    phrase brackets.
+
+    Example:
+    >>> text = 'Neither the name [[3]] of nor the names of its'
+    >>> #        0      1    2     3  4  5   6   7    8   9
+    >>> x = get_extra_phrase_spans(text)
+    >>> assert x == [(Span([3]), 3)], x
+    """
+    ipos = 0
+    in_extra_phrase = False
+    current_phrase_value = []
+    extra_phrase_spans = []
+
+    for token in extra_phrase_tokenizer(text):
+        if token == EXTRA_PHRASE_OPEN:
+            in_extra_phrase = True
+            current_phrase_value = []
+            continue
+
+        elif token == EXTRA_PHRASE_CLOSE:
+            if in_extra_phrase:
+                # The token between the double square brackets ``[[token]]``
+                # must be a single digit token.
+                if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
+                    extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))
+
+            in_extra_phrase = False
+            current_phrase_value = []
+            continue
+
+        if in_extra_phrase:
+            # Consider only the first token after the double open square bracket ``[[``
+            if len(current_phrase_value) == 0:
+                current_phrase_value.append(token)
+
+        ipos += 1
+
+    return extra_phrase_spans
+
+
 def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 6 deletions

@@ -1,12 +1,12 @@
 {
   "license_detections": [
     {
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50",
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e",
       "license_expression": "bsd-new",
       "license_expression_spdx": "BSD-3-Clause",
       "detection_count": 1,
       "detection_log": [
-        "extra-words"
+        "extra-words-permitted-in-rule"
       ],
       "reference_matches": [
         {
@@ -16,7 +16,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,
@@ -46,7 +46,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,
@@ -57,9 +57,9 @@
         }
       ],
       "detection_log": [
-        "extra-words"
+        "extra-words-permitted-in-rule"
      ],
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
     }
   ],
   "license_clues": [],
