Commit 25849d2

Add new phrase for `extra-words` in rules
Add a new kind of phrase, `extra_phrase`, specifically for extra-words handling. The phrase is written in the format [[n]], where n indicates the maximum number of extra words allowed at that position in the rule. If extra words appear at the marked position and their count does not exceed the allowed limit n, the match score is raised to 100. Signed-off-by: Alok Kumar <[email protected]>
1 parent 36a5bc2 commit 25849d2
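
For example (this mirrors the new tests in tests/licensedcode/test_match.py below), the rule text

    Neither the name of [[3]] nor the names of its

now matches the query

    Neither the name of XXX YYY ZZZ nor the names of its

with a score of 100, because the 3 extra words sit at the marked position and stay within the limit; a fourth extra word at that position means the match no longer scores 100.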

File tree

9 files changed (+252, -12 lines)


src/licensedcode/data/rules/bsd-new_158.RULE

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

-Neither the name of nor the names of its
+Neither the name of [[3]] nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

src/licensedcode/index.py

Lines changed: 3 additions & 0 deletions
@@ -391,6 +391,9 @@ def _add_rules(
     # "weak" rules can only be matched with an automaton exactly.
     is_weak = True

+    # identify and capture the spans of extra phrases specified within the rule
+    rule.extra_phrase_spans = list(rule.extra_phrases())
+
     for rts in rule.tokens():
         rule_tokens_append(rts)
         rtid = dictionary_get(rts)

src/licensedcode/match.py

Lines changed: 53 additions & 0 deletions
@@ -598,6 +598,12 @@ def score(self):
     in the matched range (including unknowns and unmatched) and the matched
     rule relevance.
     """
+
+    # Check whether extra words in the matched text appear in allowed positions,
+    # and do not exceed the maximum allowed word count at those positions.
+    if is_extra_words_position_valid(self):
+        return 100
+
     # relevance is a number between 0 and 100. Divide by 100
     relevance = self.rule.relevance / 100
     if not relevance:

@@ -1071,6 +1077,53 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
 # early from the loops: trying to check containment on wildly separated matches
 # does not make sense

+def is_extra_words_position_valid(match):
+    """
+    Return True if the extra words appear in valid positions and do not exceed
+    the maximum allowed word count at those positions. Otherwise, return False.
+    """
+    rule_spans = match.ispan.subspans()
+
+    # If there are multiple subspans, not all required tokens are contiguous.
+    if len(rule_spans) > 1:
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    extra_phrase_spans = match.rule.extra_phrase_spans
+
+    if not extra_phrase_spans:
+        return False
+
+    # count of extra-word tokens inserted in ``matched_tokens``
+    matched_count = 0
+
+    # count of extra phrase markers seen so far
+    extra_phrase_count = 0
+
+    for span, allowed_extra_words in extra_phrase_spans:
+        rule_index = span.start
+        matched_index = rule_index + matched_count - extra_phrase_count
+        extra_words_count = 0
+
+        # Count how many tokens in the matched text do not match the next rule token.
+        while (matched_index < len(matched_tokens) and
+                matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
+            matched_index += 1
+            matched_count += 1
+            extra_words_count += 1
+
+        extra_phrase_count += 1
+
+        if extra_words_count > allowed_extra_words:
+            return False
+
+    return True
+

 def filter_contained_matches(
     matches,
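
To make the index arithmetic in is_extra_words_position_valid easier to follow, here is a minimal standalone sketch of the same walk over plain token lists, with LicenseMatch, Span, and index_tokenizer stripped away; all names below are illustrative and not part of this commit:

    def extra_words_within_limits(rule_tokens, matched_tokens, extra_phrase_spans):
        """
        rule_tokens: tokens of the raw rule text, where each [[n]] marker
            contributes its digit token (as index_tokenizer does).
        matched_tokens: tokens of the matched text, possibly with extra words.
        extra_phrase_spans: (rule_index, allowed) pairs, where rule_index is
            the position of the digit token in rule_tokens.
        """
        matched_count = 0       # extra words consumed across all markers
        extra_phrase_count = 0  # markers consumed so far

        for rule_index, allowed in extra_phrase_spans:
            # Map the rule position onto the matched text: shift right by one
            # for each extra word already consumed, left by one for each digit
            # marker token that has no counterpart in the matched text.
            matched_index = rule_index + matched_count - extra_phrase_count
            next_rule_token = rule_tokens[rule_index + 1]
            extra_words = 0
            # Count tokens until the matched text realigns with the rule.
            while (matched_index < len(matched_tokens)
                   and matched_tokens[matched_index] != next_rule_token):
                matched_index += 1
                matched_count += 1
                extra_words += 1
            extra_phrase_count += 1
            if extra_words > allowed:
                return False
        return True

    rule = 'neither the name of 3 nor the names of its'.split()
    ok = 'neither the name of xxx yyy zzz nor the names of its'.split()
    too_many = 'neither the name of xxx yyy zzz aaa nor the names of its'.split()
    assert extra_words_within_limits(rule, ok, [(4, 3)])
    assert not extra_words_within_limits(rule, too_many, [(4, 3)])

The `- extra_phrase_count` term compensates for the digit token that each [[n]] marker leaves behind in the tokenized rule text but that never appears in the matched text.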

src/licensedcode/models.py

Lines changed: 34 additions & 1 deletion
@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

+import re
 import os
 import sys
 import traceback

@@ -43,6 +44,7 @@
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import query_lines
+from licensedcode.tokenize import get_extra_phrase_spans
 from scancode.api import SCANCODE_LICENSEDB_URL
 from scancode.api import SCANCODE_LICENSE_URL
 from scancode.api import SCANCODE_RULE_URL

@@ -1683,6 +1685,17 @@ class BasicRule:
         )
     )

+    extra_phrase_spans = attr.ib(
+        default=attr.Factory(list),
+        repr=False,
+        metadata=dict(
+            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+                 'Each tuple contains a Span of token positions in the rule text and an '
+                 'integer indicating the maximum number of extra tokens allowed at that '
+                 'position. Extra phrases are enclosed in [[double square brackets]] in '
+                 'the rule text.'
+        )
+    )

     source = attr.ib(
         default=None,
         repr=False,

@@ -2317,8 +2330,10 @@ def tokens(self):
     "is_continuous", "minimum_coverage" and "stopword_by_pos" are
     recomputed as a side effect.
     """
+
+    # remove the extra_phrase markers from the rule text
+    text = remove_extra_phrase(self.text)

-    text = self.text
     # We tag this rule as being a bare URL if it starts with a scheme and is
     # on one line: this is used to determine a matching approach

@@ -2353,6 +2368,17 @@ def _set_continuous(self):
     ):
         self.is_continuous = True

+    def extra_phrases(self):
+        """
+        Return an iterable of `(Span, int)` tuples marking the positions of
+        extra phrases in the rule text. Each tuple consists of:
+        - a `Span` object representing the position in the tokenized rule text, and
+        - an integer `n` indicating how many extra tokens are allowed at that position.
+        """
+        if self.text:
+            yield from get_extra_phrase_spans(self.text)

     def build_required_phrase_spans(self):
         """
         Return a list of Spans marking required phrases token positions of that must

@@ -2570,6 +2596,13 @@ def from_match_data(license_match_mapping):
     return get_index().rules_by_id[rule_identifier]


+def remove_extra_phrase(text):
+    """
+    Remove extra phrase markers like [[n]], where n is a digit.
+    """
+    pattern = r'\[\[\d+\]\]'
+    return re.sub(pattern, '', text)
+
 def compute_relevance(length):
     """
     Return a computed ``relevance`` given a ``length`` and a threshold.
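
As a quick sanity check of the new remove_extra_phrase helper, a doctest-style sketch (the expected output follows from the regex above; the double space left where the marker was is harmless, since downstream tokenization splits on runs of non-word characters):

    >>> from licensedcode.models import remove_extra_phrase
    >>> remove_extra_phrase('Neither the name of [[3]] nor the names of its')
    'Neither the name of  nor the names of its'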

src/licensedcode/tokenize.py

Lines changed: 66 additions & 0 deletions
@@ -81,12 +81,78 @@ def query_lines(
 required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall

+extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'

+EXTRA_PHRASE_OPEN = '[['
+EXTRA_PHRASE_CLOSE = ']]'
+
 # FIXME: this should be folded in a single pass tokenization with the index_tokenizer


+def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
+    """
+    Yield tokens from a rule ``text``, including extra phrase [[n]] markers,
+    where ``n`` denotes the maximum number of extra words that are valid at
+    that position. This works the same way as ``required_phrase_tokenizer``.
+    """
+    if not text:
+        return
+    if not preserve_case:
+        text = text.lower()
+
+    for token in extra_phrase_splitter(text):
+        if token and token not in stopwords:
+            yield token
+
+
+def get_extra_phrase_spans(text):
+    """
+    Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase
+    found in ``text``, where `n` must always be a single digit token inside
+    the extra phrase brackets.
+
+    Example:
+    >>> text = 'Neither the name [[3]] of nor the names of its'
+    >>> #           0    1    2     3   4   5   6    7   8   9
+    >>> x = get_extra_phrase_spans(text)
+    >>> assert x == [(Span([3]), 3)], x
+    """
+    ipos = 0
+    in_extra_phrase = False
+    current_phrase_value = []
+    extra_phrase_spans = []
+
+    for token in extra_phrase_tokenizer(text):
+        if token == EXTRA_PHRASE_OPEN:
+            in_extra_phrase = True
+            current_phrase_value = []
+            continue
+
+        elif token == EXTRA_PHRASE_CLOSE:
+            if in_extra_phrase:
+                # the token must be a digit enclosed in double square brackets
+                # ``[[token]]``, and there must be exactly one token between
+                # the extra phrase brackets
+                if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
+                    extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))
+
+            in_extra_phrase = False
+            current_phrase_value = []
+            continue
+
+        if in_extra_phrase:
+            # consider only the first token after the double open square bracket ``[[``
+            if len(current_phrase_value) == 0:
+                current_phrase_value.append(token)
+
+        ipos += 1
+
+    return extra_phrase_spans
+
+
 def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
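
A doctest-style sketch of how the two new helpers behave on a hypothetical text with more than one marker (positions derived from the counting logic above, assuming none of these words is in STOPWORDS):

    >>> from licensedcode.spans import Span
    >>> from licensedcode.tokenize import extra_phrase_tokenizer
    >>> from licensedcode.tokenize import get_extra_phrase_spans
    >>> text = 'name of [[2]] and [[4]] others'
    >>> list(extra_phrase_tokenizer(text))
    ['name', 'of', '[[', '2', ']]', 'and', '[[', '4', ']]', 'others']
    >>> spans = get_extra_phrase_spans(text)
    >>> assert spans == [(Span([2]), 2), (Span([4]), 4)], spans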

tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json

Lines changed: 6 additions & 10 deletions
@@ -1,13 +1,11 @@
 {
   "license_detections": [
     {
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50",
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e",
       "license_expression": "bsd-new",
       "license_expression_spdx": "BSD-3-Clause",
       "detection_count": 1,
-      "detection_log": [
-        "extra-words"
-      ],
+      "detection_log": [],
       "reference_matches": [
         {
           "license_expression": "bsd-new",

@@ -16,7 +14,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,

@@ -46,7 +44,7 @@
           "start_line": 4,
           "end_line": 27,
           "matcher": "2-aho",
-          "score": 99.53,
+          "score": 100,
           "matched_length": 210,
           "match_coverage": 100.0,
           "rule_relevance": 100,

@@ -56,10 +54,8 @@
           "matched_text_diagnostics": "Redistribution and use in source and binary forms, with or without\r\nmodification, are permitted provided that the following conditions are met:\r\n\r\n* Redistributions of source code must retain the above copyright notice, this\r\n list of conditions and the following disclaimer.\r\n\r\n* Redistributions in binary form must reproduce the above copyright notice,\r\n this list of conditions and the following disclaimer in the documentation\r\n and/or other materials provided with the distribution.\r\n\r\n* Neither the name of [filesize] nor the names of its\r\n contributors may be used to endorse or promote products derived from\r\n this software without specific prior written permission.\r\n\r\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\r\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\r\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
         }
       ],
-      "detection_log": [
-        "extra-words"
-      ],
-      "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
+      "detection_log": [],
+      "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
     }
   ],
   "license_clues": [],

tests/licensedcode/test_license_models.py

Lines changed: 8 additions & 0 deletions
@@ -591,6 +591,14 @@ def test_key_phrases_yields_spans(self):
         key_phrase_spans = list(rule.build_required_phrase_spans())
         assert key_phrase_spans == [Span(4), Span(7, 9)]

+    def test_extra_phrases_yields_spans(self):
+        rule_text = (
+            'Neither the name of [[3]] nor the names of its'
+        )
+        rule = models.Rule(license_expression='bsd-new', text=rule_text)
+        extra_phrase_spans = list(rule.extra_phrases())
+        assert extra_phrase_spans == [(Span(4), 3)]

     def test_key_phrases_raises_exception_when_markup_is_not_closed(self):
         rule_text = (
             'This released software is {{released}} by under {{the MIT license. '

tests/licensedcode/test_match.py

Lines changed: 23 additions & 0 deletions
@@ -1381,6 +1381,29 @@ def test_LicenseMatch_score_100_non_contiguous(self):
         m1 = LicenseMatch(rule=r1, qspan=Span(0, 19) | Span(30, 51), ispan=Span(0, 41))
         assert m1.score() == 80.77

+    def test_LicenseMatch_matches_score_100_for_extra_words_within_limit(self):
+        rule_text = 'Neither the name of [[3]] nor the names of its'
+        rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text)
+        idx = index.LicenseIndex([rule])
+
+        # The query includes 3 extra words, which is within the allowed limit.
+        query = 'Neither the name of XXX YYY ZZZ nor the names of its'
+        matches = idx.match(query_string=query, _skip_hash_match=True)
+        match = matches[0]
+        score = match.score()
+        assert score == 100
+
+    def test_LicenseMatch_matches_score_not_100_for_extra_words_exceed_limit(self):
+        rule_text = 'Neither the name of [[3]] nor the names of its'
+        rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text)
+        idx = index.LicenseIndex([rule])
+
+        # The query includes 4 extra words, one more than the allowed 3.
+        query = 'Neither the name of XXX YYY ZZZ AAA nor the names of its'
+        matches = idx.match(query_string=query, _skip_hash_match=True)
+        match = matches[0]
+        score = match.score()
+        assert score != 100

     def test_LicenseMatch_stopwords_are_treated_as_unknown_2484(self):
         rules_dir = self.get_test_loc('stopwords/index/rules')
         lics_dir = self.get_test_loc('stopwords/index/licenses')
