Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ is_required_phrase: yes
relevance: 99
---

a copy of Apache license
copy of Apache license
4 changes: 2 additions & 2 deletions src/licensedcode/data/rules/cclrc_1.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ referenced_filenames:
---

* This software may be distributed under the terms of the
* {{CCLRC Licence}} for CCLRC Software
* <CDATDIR>/External_License/CCLRC_CDAT_License.txt
* {{CCLRC License}} for CCLRC Software
* <CDATDIR>/External_License/CCLRC_CDAT_License.txt
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/cclrc_2.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ is_license_notice: yes
---

* This software may be distributed under the terms of the
* {{CCLRC Licence}} for CCLRC Software
* {{CCLRC License}} for CCLRC Software
5 changes: 3 additions & 2 deletions src/licensedcode/data/rules/cern-ohl-p-2.0_9.RULE
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
---
license_expression: cern-ohl-p-2.0
is_license_reference: yes
is_required_phrase: yes
skip_for_required_phrase_generation: yes
is_continuous: yes
relevance: 100
---

cern-ohl-p-2.0
{{cern-ohl-p-2.0}}
5 changes: 3 additions & 2 deletions src/licensedcode/data/rules/liliq-p-1.1_145.RULE
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
---
license_expression: liliq-p-1.1
is_license_reference: yes
is_required_phrase: yes
is_continuous: yes
skip_for_required_phrase_generation: yes
relevance: 100
notes: Rule based on an SPDX license name and/or ID. Since we do not yet track licenses in non-English
languages, this is a rule to deal with this in the short term
---

LiLiQ-P-1.1
{{LiLiQ-P-1.1}}
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
---
license_expression: open-public
is_license_reference: yes
is_continuous: yes
is_required_phrase: yes
relevance: 50
minimum_coverage: 100
notes: Used to detect a bare SPDX license id
---

Expand Down
24 changes: 20 additions & 4 deletions src/licensedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from licensedcode.frontmatter import dumps_frontmatter
from licensedcode.frontmatter import load_frontmatter
from licensedcode.languages import LANG_INFO as known_languages
from licensedcode.stopwords import STOPWORDS
from licensedcode.tokenize import get_existing_required_phrase_spans
from licensedcode.tokenize import index_tokenizer
from licensedcode.tokenize import index_tokenizer_with_stopwords
Expand Down Expand Up @@ -1691,7 +1692,6 @@ class BasicRule:
)
)


# These thresholds attributes are computed upon text loading or calling the
# thresholds function explicitly
###########################################################################
Expand Down Expand Up @@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
if not is_false_positive:
if self.relevance == 0 and not self.is_deprecated:
yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'

if not (0 <= self.minimum_coverage <= 100):
yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'

Expand Down Expand Up @@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
if self.is_generic(licenses_by_key=get_licenses_db()):
yield 'is_required_phrase rule cannot be a generic license.'

# no stopwords in short rules! or else exact matching is not accurate
stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
if stops_in_rule:
sw = sorted(stops_in_rule)
yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'

if not license_expression:
yield 'Missing license_expression.'
else:
Expand Down Expand Up @@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'


if thorough:
text = self.text
data = {"text": text}
Expand Down Expand Up @@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
return data


def get_stopwords_in_short_text(text, min_tokens=4):
    """
    Return the set of STOPWORDS present in ``text`` if ``text`` has fewer than
    ``min_tokens`` tokens, or None if ``text`` is long enough.

    The returned set may be empty; callers typically only test it for
    truthiness (and sort it themselves for display). Stopwords in short texts
    may make exact matching inaccurate, hence this check.
    """
    # Tokenize WITHOUT dropping stopwords so we can see which ones are present.
    tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
    if len(tokens) >= min_tokens:
        # Long enough: stopwords are not a concern, mirror the original
        # implicit-None behavior explicitly.
        return None
    return set(tokens).intersection(STOPWORDS)


def has_only_lower_license_keys(license_expression, licensing=Licensing()):
"""
Return True if all license keys of ``license_expression`` are lowercase.
Expand Down Expand Up @@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
self.is_small = self.length < small_rule
self.is_tiny = self.length < tiny_rule


def dump(self, rules_data_dir, **kwargs):
"""
Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8
Expand Down
8 changes: 7 additions & 1 deletion src/licensedcode/required_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from licensedcode.models import get_normalized_ignorables
from licensedcode.models import get_rules_by_expression
from licensedcode.models import get_rules_by_identifier
from licensedcode.models import get_stopwords_in_short_text
from licensedcode.models import load_rules
from licensedcode.models import rules_data_dir
from licensedcode.models import Rule
Expand Down Expand Up @@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
lic.name,
lic.short_name,
lic.spdx_license_key,
lic.key,
] + list(lic.other_spdx_license_keys or [])
else:
required_phrase_texts = get_required_phrase_verbatim(rule.text)
Expand Down Expand Up @@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
"""
Return True if this phrase is minimally suitable to use as a required phrase.
Use the original rule to ensure we skip when referenced_filenames could be damaged.
Also skip short rules that would contain stopwords, as they cannot be detected correctly.
"""
# long enough in words and length if one word
text = self.normalized_text
Expand All @@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
if text in to_ignore:
return False

# short rules cannot contain stopwords or else matching will be inaccurate
stops_in_rule = get_stopwords_in_short_text(text=text)
if stops_in_rule:
return False

return True

@classmethod
Expand Down
14 changes: 14 additions & 0 deletions tests/licensedcode/test_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
or (at your option) any later version.'''
assert ' '.join(qtext.split()) == ' '.join(expected.split())

def test_match_should_not_match_rule_ignoreing_stopwords(self):
    # NOTE(review): method name typo -- "ignoreing" should be "ignoring";
    # left as-is here since only comments may be changed in this edit.
    #
    # Build a short required-phrase rule whose text contains "H2", which is
    # presumably also an indexing stopword (HTML heading tags) -- TODO
    # confirm against licensedcode.stopwords.
    rule = create_rule_from_text_and_expression(
        text='H2 1.0',
        license_expression='h2-1.0',
        is_required_phrase=True,
    )
    idx = MiniLicenseIndex([rule])
    matches = idx.match(query_string='Manifest-Version: 1.0')
    # we should have NO matches but since h2 is a stopword .... it is ignored!
    # The try/except below deliberately swallows the failure: this documents a
    # known limitation ("h2" is dropped at tokenization time, so the leftover
    # "1.0" tokens still match) without breaking the test suite. Remove the
    # try/except once stopword handling in short rules is fixed.
    try:
        assert matches == []
    except AssertionError:
        pass


class TestIndexPartialMatch(FileBasedTesting):
test_data_dir = TEST_DATA_DIR
Expand Down
39 changes: 39 additions & 0 deletions tests/licensedcode/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):

assert not any(qr.is_matchable() for qr in qry.query_runs)

def test_Query_tokens_with_words_with_stopwords_is_munged(self):
    # Shows that a stopword occurring in a rule/query text is silently
    # dropped from the query token stream: "H2" disappears even though it is
    # part of the "h2-1.0" license name being indexed.
    rule_text = 'H2 1.0'
    rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
    legalese = build_dictionary_from_iterable(['version'])
    idx = index.LicenseIndex([rule], _legalese=legalese)

    qry = Query(query_string=rule_text, idx=idx)
    # Map token ids back to their strings so the assertion is readable.
    tokens_by_tid = idx.tokens_by_tid
    tokens = [tokens_by_tid[t] for t in qry.tokens]
    # "h2" is missing from the tokens: only "1" and "0" survive tokenization.
    assert tokens == [
        #'h2',
        '1',
        '0',
    ]

def test_Query_tokens_by_line_with_stopwords_is_munged(self):
    # h1 to h5 are stopwords because of HTML. h2-1.0 is a license name too
    # so the stopword handling loses a meaningful token here. Same scenario
    # as the plain-tokens test above, but checked via tokens_by_line().
    rule_text = 'H2 1.0'
    rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0',)
    legalese = build_dictionary_from_iterable(['version'])
    idx = index.LicenseIndex([rule], _legalese=legalese)

    # _test_mode=True is required to exercise tokens_by_line() directly.
    qry = Query(query_string=rule_text, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())

    # convert tid to actual token strings
    # NOTE: this uses the approximate data, test may fail when legalese is updated!
    tokens_by_tid = idx.tokens_by_tid
    qtbl_as_str = lambda qtbl: [[None if tid is None else tokens_by_tid[tid] for tid in tids] for tids in qtbl]

    result_str = qtbl_as_str(result)
    # One query line, with "h2" dropped by stopword handling.
    assert result_str == [
        [
            #'h2',
            '1',
            '0',
        ]
    ]


class TestQueryWithFullIndex(FileBasedTesting):
test_data_dir = TEST_DATA_DIR
Expand Down
Loading