Skip to content

Commit 16d3364

Browse files
committed
add more tests for is_extra_words_at_valid_positions and improve detection
Signed-off-by: Alok Kumar <[email protected]>
1 parent 5b933c0 commit 16d3364

File tree

3 files changed

+150
-18
lines changed

3 files changed

+150
-18
lines changed

src/licensedcode/detection.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from licensedcode.match import LicenseMatch
3232
from licensedcode.match import set_matched_lines
3333
from licensedcode.match import is_extra_words_position_valid
34+
from licensedcode.match import is_extra_words_at_valid_positions
3435
from licensedcode.models import compute_relevance
3536
from licensedcode.models import Rule
3637
from licensedcode.models import UnDetectedRule
@@ -1167,21 +1168,6 @@ def has_low_rule_relevance(license_matches):
11671168
)
11681169

11691170

1170-
def is_extra_words_at_valid_positions(license_matches):
    """
    Return True if every match in ``license_matches`` (a list of LicenseMatch)
    that has extra words has those extra words at valid positions.

    Matches without extra words are skipped, so an empty list, or a list in
    which no match has extra words, also returns True.
    """
    # A positive query coverage coefficient means the match has `extra-words`;
    # only those matches need their extra-word positions validated.
    return all(
        is_extra_words_position_valid(match)
        for match in license_matches
        if calculate_query_coverage_coefficient(match) > 0
    )
1184-
11851171
def is_false_positive(license_matches, package_license=False):
11861172
"""
11871173
Return True if all of the matches in ``license_matches`` List of LicenseMatch

src/licensedcode/match.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,18 +1133,27 @@ def is_extra_words_position_valid(match):
11331133

11341134
rule_index+=1
11351135

1136-
# check if any `extra-words` is present and return False because this `extra-words` are not at marked place
1136+
# check whether any `extra-words` remain after all `extra-phrase-spans` in the rule have been checked
11371137
while (matched_index < len(matched_tokens) and
11381138
matched_tokens[matched_index] == rule_tokens[rule_index]):
11391139
matched_index+=1
11401140
rule_index+=1
1141-
1141+
1142+
# some `extra-words` are found
11421143
if matched_index != len(matched_tokens):
11431144
return False
11441145

11451146
return True
11461147

11471148

1149+
def is_extra_words_at_valid_positions(license_matches):
    """
    Return True as soon as one match in ``license_matches`` passes the
    ``is_extra_words_position_valid`` check. Return False when no match
    passes, including when ``license_matches`` is empty.
    """
    for license_match in license_matches:
        if is_extra_words_position_valid(license_match):
            return True
    return False
1155+
1156+
11481157
def filter_contained_matches(
11491158
matches,
11501159
trace=TRACE_FILTER_CONTAINED,

tests/licensedcode/test_match.py

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from licensedcode.match import get_full_matched_text
2222
from licensedcode.match import get_matching_regions
2323
from licensedcode.match import is_extra_words_position_valid
24+
from licensedcode.match import is_extra_words_at_valid_positions
2425
from licensedcode.match import LicenseMatch
2526
from licensedcode.match import merge_matches
2627
from licensedcode.match import reportable_tokens
@@ -1400,9 +1401,11 @@ def test_extra_words_at_wrong_position(self):
14001401
idx = index.LicenseIndex([rule])
14011402

14021403
query = """
1403-
Redistribution and amazing use in great source and binary forms are permitted.
1404+
Redistribution and amazing use in source and binary forms are permitted.
14041405
"""
1406+
# here the word 'amazing' is at the wrong place
14051407
match = idx.match(query_string=query, _skip_hash_match=True)[0]
1408+
14061409
assert is_extra_words_position_valid(match) is False
14071410

14081411
def test_exact_match_without_extra_markers(self):
@@ -1437,6 +1440,140 @@ def test_extra_words_one_at_right_place_and_one_at_not_right_place(self):
14371440
match = idx.match(query_string=query, _skip_hash_match=True)[0]
14381441
assert is_extra_words_position_valid(match) is False
14391442

1443+
def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_have_no_extra_words(self):
    """
    The first rule carries a ``[[3]]`` marker and the query has three
    additional words ('of this software') at that spot; the second rule text
    appears verbatim in the query. The check is expected to return True.
    """
    rules = [
        create_rule_from_text_and_expression(
            license_expression='extra-words',
            text="Redistribution and use [[3]] in source and binary forms are permitted.",
        ),
        create_rule_from_text_and_expression(
            license_expression='mit',
            text="under the MIT license",
        ),
    ]
    license_index = index.LicenseIndex(rules)

    query = """
        Redistribution and use of this software in source and binary forms are permitted.
        under the MIT license
    """
    found = license_index.match(query_string=query, _skip_hash_match=True)

    assert len(found) == 2
    assert is_extra_words_at_valid_positions(found) is True
1467+
1468+
def test_extra_words_if_one_match_have_extra_words_at_right_place_but_exceed_limit_and_another_match_have_no_extra_words(self):
    """
    The first rule carries a ``[[3]]`` marker but the query has four
    additional words at that spot; the second rule text appears verbatim in
    the query. The check is expected to return False.
    """
    rules = [
        create_rule_from_text_and_expression(
            license_expression='extra-words',
            text="Redistribution and use [[3]] in source and binary forms are permitted.",
        ),
        create_rule_from_text_and_expression(
            license_expression='mit',
            text="under the MIT license",
        ),
    ]
    license_index = index.LicenseIndex(rules)

    query = """
        Redistribution and use of this software AAA in source and binary forms are permitted.
        under the MIT license
    """
    found = license_index.match(query_string=query, _skip_hash_match=True)

    assert len(found) == 2

    # The match with `extra-words` exceeds the marker limit: there are four
    # `extra-words` here, i.e. 'of', 'this', 'software' and 'AAA'.
    assert is_extra_words_at_valid_positions(found) is False
1495+
1496+
def test_extra_words_if_all_match_have_no_extra_words(self):
    """
    Neither rule has a ``[[n]]`` marker and the query repeats both rule texts
    verbatim. The check is expected to return False.
    """
    rules = [
        create_rule_from_text_and_expression(
            license_expression='extra-words',
            text="Redistribution and use in source and binary forms are permitted.",
        ),
        create_rule_from_text_and_expression(
            license_expression='mit',
            text="under the MIT license",
        ),
    ]
    license_index = index.LicenseIndex(rules)

    query = """
        Redistribution and use in source and binary forms are permitted.
        under the MIT license
    """
    found = license_index.match(query_string=query, _skip_hash_match=True)

    assert len(found) == 2

    assert is_extra_words_at_valid_positions(found) is False
1521+
1522+
def test_extra_words_if_one_match_have_extra_words_at_right_place_and_another_match_at_wrong_place(self):
    """
    Both rules carry a ``[[3]]`` marker. The query has the additional words
    of the first rule at the marked spot, while the text for the second rule
    also contains a stray word ('Harris') away from its marker. The check is
    expected to return True.
    """
    rules = [
        create_rule_from_text_and_expression(
            license_expression='extra-words',
            text="Redistribution and use [[3]] in source and binary forms are permitted.",
        ),
        create_rule_from_text_and_expression(
            license_expression='extra-words2',
            text="Neither the name of [[3]] nor the names of its",
        ),
    ]
    license_index = index.LicenseIndex(rules)

    query = """
        Redistribution and use of this software in source and binary forms are permitted.
        Neither the name of William Henry James nor the names of Harris its
    """
    found = license_index.match(query_string=query, _skip_hash_match=True)

    assert len(found) == 2

    # One match has its `extra-words` at the correct place. The other match
    # has `extra-words` at the correct place plus one word, 'Harris', at a
    # wrong place. `is_extra_words_at_valid_positions` still returns True
    # because one match has its `extra-words` at the correct place.
    assert is_extra_words_at_valid_positions(found) is True
1551+
1552+
def test_extra_words_all_match_have_extra_words_at_right_place(self):
    """
    Both rules carry a ``[[3]]`` marker and the query has three additional
    words at each marked spot. The check is expected to return True.
    """
    rules = [
        create_rule_from_text_and_expression(
            license_expression='extra-words',
            text="Redistribution and use [[3]] in source and binary forms are permitted.",
        ),
        create_rule_from_text_and_expression(
            license_expression='extra-words2',
            text="Neither the name of [[3]] nor the names of its",
        ),
    ]
    license_index = index.LicenseIndex(rules)

    query = """
        Redistribution and use of this software in source and binary forms are permitted.
        Neither the name of William Henry James nor the names of its
    """
    found = license_index.match(query_string=query, _skip_hash_match=True)

    assert len(found) == 2
    assert is_extra_words_at_valid_positions(found) is True
1576+
14401577

14411578
class TestLicenseMatchScore(FileBasedTesting):
14421579
test_data_dir = TEST_DATA_DIR

0 commit comments

Comments
 (0)