diff --git a/src/licensedcode/data/rules/bsd-new_158.RULE b/src/licensedcode/data/rules/bsd-new_158.RULE index 90af8ee4a67..b0835457774 100644 --- a/src/licensedcode/data/rules/bsd-new_158.RULE +++ b/src/licensedcode/data/rules/bsd-new_158.RULE @@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -Neither the name of nor the names of its +Neither the name of [[6]] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/src/licensedcode/data/rules/bsd-new_578.RULE b/src/licensedcode/data/rules/bsd-new_578.RULE index 99f1aad110e..ede4c9400b4 100644 --- a/src/licensedcode/data/rules/bsd-new_578.RULE +++ b/src/licensedcode/data/rules/bsd-new_578.RULE @@ -6,7 +6,9 @@ minimum_coverage: 99 Software License Agreement (BSD License) -Redistribution and use in source and binary forms, with or without +[[15]] + +Redistribution and use [[4]] in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -16,7 +18,7 @@ are met: copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of nor the names of its + * Neither the name of [[6]] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
index 34cbe582e63..47410aba095 100644
--- a/src/licensedcode/detection.py
+++ b/src/licensedcode/detection.py
@@ -30,6 +30,7 @@
 from licensedcode.cache import get_licensing
 from licensedcode.match import LicenseMatch
 from licensedcode.match import set_matched_lines
+from licensedcode.match import is_extra_words_position_valid
 from licensedcode.models import compute_relevance
 from licensedcode.models import Rule
 from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
     PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
     PACKAGE_ADD_FROM_FILE = 'from-package-file'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     UNKNOWN_MATCH = 'unknown-match'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
     """
     UNKNOWN_MATCH = 'unknown-match'
     EXTRA_WORDS = 'extra-words'
+    EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches):
         is_correct_detection(license_matches)
         and not has_unknown_matches(license_matches)
         and not has_extra_words(license_matches)
+        and not is_extra_words_at_valid_positions(license_matches)
     )
 
 
@@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches):
     )
 
 
+def is_extra_words_at_valid_positions(license_matches):
+    """
+    Return True if any of the matches in ``license_matches`` List of LicenseMatch
+    has its extra words at positions where the matched rule permits them.
+    """
+    return any(
+        is_extra_words_position_valid(license_match)
+        for license_match in license_matches
+    )
+
 def is_false_positive(license_matches, package_license=False):
     """
     Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1584,12 @@ def get_detected_license_expression(
         detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
         return detection_log, combined_expression
 
+    elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
+        if TRACE_ANALYSIS:
+            logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
+        matches_for_expression = license_matches
+        detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)
+
     elif analysis == DetectionCategory.EXTRA_WORDS.value:
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False):
         threshold=IMPERFECT_MATCH_COVERAGE_THR,
     ):
         return DetectionCategory.IMPERFECT_COVERAGE.value
+
+    # Case where `extra-words` are in the right place
+    elif is_extra_words_at_valid_positions(license_matches=license_matches):
+        return DetectionCategory.EXTRA_WORDS_PERMITTED.value
 
     # Case where at least one of the match have extra words
     elif has_extra_words(license_matches=license_matches):
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
index 90eba30d55e..2e2b27bdde9 100644
--- a/src/licensedcode/match.py
+++ b/src/licensedcode/match.py
@@ -598,6 +598,12 @@ def score(self):
         in the matched range (including unknowns and unmatched) and the matched
         rule relevance.
         """
+
+        # Check whether extra words in the matched text appear in allowed positions,
+        # and do not exceed the maximum allowed word count at those positions.
+        if is_extra_words_position_valid(match=self):
+            return 100
+
         # relevance is a number between 0 and 100.
Divide by 100
         relevance = self.rule.relevance / 100
         if not relevance:
@@ -1071,6 +1077,61 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
             # early from the loops: trying to check containment on wildly separated matches
             # does not make sense
 
+
+def is_extra_words_position_valid(match):
+    """
+    Return True if the matched text of ``match`` is the text of its rule with
+    extra words inserted only at the positions of the ``[[n]]`` extra phrase
+    markers of the rule, with at most ``n`` extra words at each position.
+    Return False otherwise, including when the rule has no extra phrase marker.
+    """
+    # More than one subspan means that the matched rule tokens are not contiguous.
+    if len(match.ispan.subspans()) > 1:
+        return False
+
+    extra_phrase_spans = match.rule.extra_phrase_spans
+    if not extra_phrase_spans:
+        # Checked before tokenizing anything: this runs on every score() call.
+        return False
+
+    matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
+    rule_tokens = list(index_tokenizer(match.rule.text))
+    matched_length = len(matched_tokens)
+    rule_length = len(rule_tokens)
+
+    # next positions to check in ``matched_tokens`` and ``rule_tokens``
+    matched_pos = 0
+    rule_pos = 0
+
+    # Each span position counts every earlier marker as one token, but the
+    # markers are not in ``rule_tokens``: ``marker_count`` corrects for this.
+    for marker_count, (span, allowed_extra_words) in enumerate(extra_phrase_spans):
+        # position in ``rule_tokens`` of the first token after this marker
+        marker_pos = span.start - marker_count
+        segment_length = marker_pos - rule_pos
+        if segment_length < 0 or marker_pos > rule_length:
+            return False
+
+        # the tokens before the marker must be the same on both sides
+        matched_end = matched_pos + segment_length
+        if matched_tokens[matched_pos:matched_end] != rule_tokens[rule_pos:marker_pos]:
+            return False
+        matched_pos = matched_end
+        rule_pos = marker_pos
+
+        # Extra words run up to the next rule token, or up to the end of the
+        # matched text when the marker is at the end of the rule.
+        next_rule_token = rule_tokens[rule_pos] if rule_pos < rule_length else None
+        extra_words_count = 0
+        while matched_pos < matched_length and matched_tokens[matched_pos] != next_rule_token:
+            matched_pos += 1
+            extra_words_count += 1
+            if extra_words_count > allowed_extra_words:
+                return False
+
+    # the tokens after the last marker must also be the same on both sides
+    return matched_tokens[matched_pos:] == rule_tokens[rule_pos:]
+
 
 def filter_contained_matches(
     matches,
diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
index
354d93f52d3..47f0238660e 100644
--- a/src/licensedcode/models.py
+++ b/src/licensedcode/models.py
@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+import re
 import os
 import sys
 import traceback
@@ -43,6 +44,7 @@
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
 from licensedcode.tokenize import query_lines
+from licensedcode.tokenize import get_extra_phrase_spans
 from scancode.api import SCANCODE_LICENSEDB_URL
 from scancode.api import SCANCODE_LICENSE_URL
 from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1685,17 @@ class BasicRule:
         )
     )
 
+    extra_phrase_spans = attr.ib(
+        default=attr.Factory(list),
+        repr=False,
+        metadata=dict(
+            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
+            'Each tuple contains a Span of token positions in the rule text and an integer '
+            'indicating the maximum number of extra tokens allowed at that position. '
+            'extra phrases are enclosed in [[double square brackets]] in the rule text.'
+        )
+    )
+
     source = attr.ib(
         default=None,
         repr=False,
@@ -2306,6 +2319,11 @@ def load_data(self, rule_file):
         except Exception:
             trace = traceback.format_exc()
             raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')
+
+        # capture the extra phrase spans before their markers are removed
+        # from the text: tokens() cannot find them afterwards
+        self.extra_phrase_spans = list(self.extra_phrases())
+        self.text = remove_extra_phrase(self.text)
 
         return self
 
@@ -2317,8 +2335,19 @@ def tokens(self):
         "is_continuous", "minimum_coverage" and "stopword_by_pos" are recomputed as a
         side effect.
         """
+
+        # identify and capture the spans of extra phrases specified within the rule.
+        # Keep the spans captured earlier when the markers are no longer in the
+        # text, because load_data() or a previous call already removed them.
+        extra_phrase_spans = list(self.extra_phrases())
+        if extra_phrase_spans:
+            self.extra_phrase_spans = extra_phrase_spans
+
+        # remove extra_phrase marker from rules
+        self.text = remove_extra_phrase(self.text)
         text = self.text
+
 
         # We tag this rule as being a bare URL if it starts with a scheme and is
         # on one line: this is used to determine a matching approach
 
@@ -2353,6 +2382,17 @@ def _set_continuous(self):
         ):
             self.is_continuous = True
 
+    def extra_phrases(self):
+        """
+        Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text.
+
+        Each tuple consists of:
+        - a `Span` object representing the position in the tokenized rule text, and
+        - an integer `n` indicating how many extra tokens are allowed at that position.
+        """
+        if self.text:
+            yield from get_extra_phrase_spans(self.text)
+
     def build_required_phrase_spans(self):
         """
         Return a list of Spans marking required phrases token positions of that must
@@ -2570,6 +2610,18 @@ def from_match_data(license_match_mapping):
     return get_index().rules_by_id[rule_identifier]
 
 
+def remove_extra_phrase(text):
+    """
+    Return ``text`` without its extra phrase markers such as ``[[n]]`` where
+    ``n`` is a number made of one or more digits. Whitespace around ``n`` is
+    accepted as get_extra_phrase_spans() also recognizes these markers.
+    """
+    if not text:
+        return text
+    pattern = r'\[\[\s*\d+\s*\]\]'
+    return re.sub(pattern, '', text)
+
+
 def compute_relevance(length):
     """
     Return a computed ``relevance`` given a ``length`` and a threshold.
diff --git a/src/licensedcode/tokenize.py b/src/licensedcode/tokenize.py
index bea07dd5a21..dd7ad3356b1 100644
--- a/src/licensedcode/tokenize.py
+++ b/src/licensedcode/tokenize.py
@@ -81,12 +81,83 @@ def query_lines(
 required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
 required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall
 
+
+extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
+extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall
+
 REQUIRED_PHRASE_OPEN = '{{'
 REQUIRED_PHRASE_CLOSE = '}}'
 
+EXTRA_PHRASE_OPEN = '[['
+EXTRA_PHRASE_CLOSE = ']]'
+
 # FIXME: this should be folded in a single pass tokenization with the index_tokenizer
 
 
+def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
+    """
+    Yield tokens from a rule ``text`` including extra phrases [[n]] markers.
+    Here ``n`` is the maximum number of extra words that are valid at that position.
+    This works the same way as ``required_phrase_tokenizer``.
+    """
+    if not text:
+        return
+    if not preserve_case:
+        text = text.lower()
+
+    for token in extra_phrase_splitter(text):
+        if token and token not in stopwords:
+            yield token
+
+
+def get_extra_phrase_spans(text):
+    """
+    Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``.
+    Here, `n` must be the only token between the brackets and be made of decimal
+    digits: any other bracketed content is not an extra phrase and is skipped.
+
+    Example:
+    >>> text = 'Neither the name [[3]] of nor the names of its'
+    >>> #       0       1   2    3     4  5   6   7     8  9
+    >>> x = get_extra_phrase_spans(text)
+    >>> assert x == [(Span([3]), 3)], x
+    """
+    ipos = 0
+    in_extra_phrase = False
+    current_phrase_value = []
+    extra_phrase_spans = []
+
+    for token in extra_phrase_tokenizer(text):
+        if token == EXTRA_PHRASE_OPEN:
+            in_extra_phrase = True
+            current_phrase_value = []
+            continue
+
+        if token == EXTRA_PHRASE_CLOSE:
+            # A marker is valid only if it holds exactly one token made of decimal
+            # digits. isdecimal() is used as isdigit() also accepts characters
+            # such as superscript digits that int() cannot convert.
+            if (
+                in_extra_phrase
+                and len(current_phrase_value) == 1
+                and current_phrase_value[0].isdecimal()
+            ):
+                extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))
+
+            in_extra_phrase = False
+            current_phrase_value = []
+            continue
+
+        if in_extra_phrase:
+            # keep every token found between the brackets, so that a marker
+            # holding more than one token is rejected when it is closed
+            current_phrase_value.append(token)
+
+        ipos += 1
+
+    return extra_phrase_spans
+
+
 def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
     """
     Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json index fcdb8639ddb..469b7340eee 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-2-aho-license.expected.json @@ -1,12 +1,12 @@ { "license_detections": [ { - "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50", + "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e", "license_expression": "bsd-new", "license_expression_spdx": "BSD-3-Clause", "detection_count": 1, "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], "reference_matches": [ { @@ -16,7 +16,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -46,7 +46,7 @@ "start_line": 4, "end_line": 27, "matcher": "2-aho", - "score": 99.53, + "score": 100, "matched_length": 210, "match_coverage": 100.0, "rule_relevance": 100, @@ -57,9 +57,9 @@ } ], "detection_log": [ - "extra-words" + "extra-words-permitted-in-rule" ], - "identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50" + "identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e" } ], "license_clues": [], diff --git a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json index c078b0be9f0..685252b79d3 100644 --- a/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json +++ b/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json @@ -1,70 +1,70 @@ -{ - "license_detections": [ - { - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e", - 
"license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "detection_count": 1, - "detection_log": [ - "extra-words" - ], - "reference_matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ] - } - ], - "files": [ - { - "path": "LICENSE", - "type": "file", - "detected_license_expression": "bsd-new", - "detected_license_expression_spdx": "BSD-3-Clause", - "license_detections": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "matches": [ - { - "license_expression": "bsd-new", - "license_expression_spdx": "BSD-3-Clause", - "from_file": "scan-extra-words-3-seq-license/LICENSE", - "start_line": 1, - "end_line": 31, - "matcher": "3-seq", - "score": 92.67, - "matched_length": 215, - "match_coverage": 100.0, - "rule_relevance": 100, - "rule_identifier": "bsd-new_578.RULE", - "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", - "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to 
endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", - "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE 
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." - } - ], - "detection_log": [ - "extra-words" - ], - "identifier": "bsd_new-f757f201-d182-a694-093b-6c34d20e9f8e" - } - ], - "license_clues": [], - "percentage_of_license_text": 92.67, - "scan_errors": [] - } - ] +{ + "license_detections": [ + { + "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2", + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "detection_count": 1, + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "reference_matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "scan-extra-words-3-seq-license/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 100, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list 
of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS 
PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ] + } + ], + "files": [ + { + "path": "LICENSE", + "type": "file", + "detected_license_expression": "bsd-new", + "detected_license_expression_spdx": "BSD-3-Clause", + "license_detections": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "matches": [ + { + "license_expression": "bsd-new", + "license_expression_spdx": "BSD-3-Clause", + "from_file": "scan-extra-words-3-seq-license/LICENSE", + "start_line": 1, + "end_line": 31, + "matcher": "3-seq", + "score": 100, + "matched_length": 215, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "bsd-new_578.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-new_578.RULE", + "matched_text": "Software License Agreement (BSD License)\n\nCopyright (c) 2009-2015, Kevin Decker \n\nAll rights reserved.\n\nRedistribution and use of this software in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, 
this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of Kevin Decker nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.", + "matched_text_diagnostics": "Software License Agreement (BSD License)\n\n[Copyright] ([c]) [2009]-[2015], [Kevin] [Decker] <[kpdecker]@[gmail].[com]>\n\n[All] [rights] [reserved].\n\nRedistribution and use [of] [this] [software] in source and binary forms, with or without modification,\nare permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above\n copyright notice, this list of conditions and the\n following disclaimer.\n\n* Redistributions in binary form must reproduce the above\n copyright notice, this list of conditions and the\n following disclaimer in the documentation and/or other\n materials provided with the distribution.\n\n* Neither the name of [Kevin] [Decker] nor the names of its\n contributors may be used to endorse or promote products\n derived from this software without specific prior\n written permission.\n\nTHIS 
SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\nIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\nFITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER\nIN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\nOF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." + } + ], + "detection_log": [ + "extra-words-permitted-in-rule" + ], + "identifier": "bsd_new-f327c6f9-6086-8bd5-22c6-3aee3b99acf2" + } + ], + "license_clues": [], + "percentage_of_license_text": 92.67, + "scan_errors": [] + } + ] } \ No newline at end of file diff --git a/tests/licensedcode/test_license_models.py b/tests/licensedcode/test_license_models.py index 6c47d92a594..fec3dd7fdd5 100644 --- a/tests/licensedcode/test_license_models.py +++ b/tests/licensedcode/test_license_models.py @@ -591,6 +591,14 @@ def test_key_phrases_yields_spans(self): key_phrase_spans = list(rule.build_required_phrase_spans()) assert key_phrase_spans == [Span(4), Span(7, 9)] + def test_extra_phrases_yields_spans(self): + rule_text = ( + 'Neither the name of [[3]] nor the names of its' + ) + rule = models.Rule(license_expression='bsd-new', text=rule_text) + extra_phrase_spans = list(rule.extra_phrases()) + assert extra_phrase_spans == [(Span(4),3)] + def test_key_phrases_raises_exception_when_markup_is_not_closed(self): rule_text = ( 'This released software is {{released}} by under {{the MIT license. 
' diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 0afab2a7fd8..3302e1a57bc 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -20,6 +20,7 @@ from licensedcode.match import filter_overlapping_matches from licensedcode.match import get_full_matched_text from licensedcode.match import get_matching_regions +from licensedcode.match import is_extra_words_position_valid from licensedcode.match import LicenseMatch from licensedcode.match import merge_matches from licensedcode.match import reportable_tokens @@ -1321,6 +1322,106 @@ def test_get_matching_regions_3_lines_enough(self): assert matches[5].qspan in regions[1] +class TestExtraWordsPosition(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_valid_extra_words_within_limit(self): + rule_text = """ + Redistribution and use [[4]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_invalid_extra_words_exceed_limit(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of this software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_no_extra_words_allowed(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. 
+ """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use of software in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_multiple_extra_spans_valid(self): + rule_text = """ + Redistribution [[2]] and use [[1]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution of content and use again in source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is True + + def test_extra_words_at_wrong_position(self): + rule_text = """ + Redistribution and use [[2]] in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and amazing use in great source and binary forms are permitted. + """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + def test_exact_match_without_extra_markers(self): + rule_text = """ + Redistribution and use in source and binary forms are permitted. + """ + rule = create_rule_from_text_and_expression( + license_expression='extra-words', + text=rule_text + ) + idx = index.LicenseIndex([rule]) + + query = """ + Redistribution and use in source and binary forms are permitted. 
+ """ + match = idx.match(query_string=query, _skip_hash_match=True)[0] + assert is_extra_words_position_valid(match) is False + + class TestLicenseMatchScore(FileBasedTesting): test_data_dir = TEST_DATA_DIR @@ -1381,6 +1482,29 @@ def test_LicenseMatch_score_100_non_contiguous(self): m1 = LicenseMatch(rule=r1, qspan=Span(0, 19) | Span(30, 51), ispan=Span(0, 41)) assert m1.score() == 80.77 + def test_LicenseMatch_matches_score_100_for_extra_words_within_limit(self): + rule_text = 'Neither the name of [[3]] nor the names of its' + rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) + idx = index.LicenseIndex([rule]) + + query = 'Neither the name of XXX YYY ZZZ nor the names of its' + matches = idx.match(query_string=query, _skip_hash_match=True) + match = matches[0] + score = match.score() + assert score == 100 + + def test_LicenseMatch_matches_score_not_100_for_extra_words_exceed_limit(self): + rule_text = 'Neither the name of [[3]] nor the names of its' + rule = create_rule_from_text_and_expression(license_expression='bsd_new', text=rule_text) + idx = index.LicenseIndex([rule]) + + # The query includes 4 extra words instead of the allowed 3. 
+ query = 'Neither the name of XXX YYY ZZZ AAA nor the names of its' + matches = idx.match(query_string=query, _skip_hash_match=True) + match = matches[0] + score = match.score() + assert score != 100 + def test_LicenseMatch_stopwords_are_treated_as_unknown_2484(self): rules_dir = self.get_test_loc('stopwords/index/rules') lics_dir = self.get_test_loc('stopwords/index/licenses') diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py index 950a94cf764..2639105094f 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -19,6 +19,7 @@ from licensedcode.spans import Span from licensedcode.tokenize import get_existing_required_phrase_spans +from licensedcode.tokenize import get_extra_phrase_spans from licensedcode.tokenize import index_tokenizer from licensedcode.tokenize import InvalidRuleRequiredPhrase from licensedcode.tokenize import matched_query_text_tokenizer @@ -26,6 +27,7 @@ from licensedcode.tokenize import query_lines from licensedcode.tokenize import query_tokenizer from licensedcode.tokenize import required_phrase_tokenizer +from licensedcode.tokenize import extra_phrase_tokenizer from licensedcode.tokenize import select_ngrams from licensedcode.tokenize import tokens_and_non_tokens from licensedcode.tokenize import word_splitter @@ -585,6 +587,62 @@ def test_get_existing_required_phrase_spans_with_markup(self): assert get_existing_required_phrase_spans(text=text) == [Span(18, 19)] +class TestExtraPhraseTokenizer(FileBasedTesting): + test_data_dir = TEST_DATA_DIR + + def test_extra_phrase_tokenizer_handles_empty_string(self): + text = '' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_blank_lines(self): + text = u' \n\n\t ' + result = list(extra_phrase_tokenizer(text)) + assert result == [] + + def test_extra_phrase_tokenizer_handles_only_brackets(self): + text = '[[3]]' + assert list(extra_phrase_tokenizer(text)) == ['[[', 
'3', ']]'] + + def test_extra_phrase_tokenizer_parses_text_with_extra_phrase_marker(self): + text = 'Neither the name of [[3]] nor the names of its' + assert list(extra_phrase_tokenizer(text)) == [ + 'neither', 'the', 'name', 'of', '[[', '3', ']]', 'nor', 'the', 'names', 'of', 'its' + ] + + def test_get_extra_phrase_spans_simple(self): + text = 'This is [[2]] an example.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([2]), 2)] + + def test_get_extra_phrase_spans_multiple(self): + text = 'Some [[4]] text [[6]] with multiple markers.' + spans = get_extra_phrase_spans(text) + assert spans == [(Span([1]), 4), (Span([3]), 6)] + + def test_get_extra_phrase_spans_returns_nothing_if_none_found(self): + text = 'Just some normal text.' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_non_numeric_values(self): + text = 'Just some [[normal]] text.' + assert get_extra_phrase_spans(text) == [] + + def test_extra_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self): + text = 'This [[1]] is a test.' + ep_tokens = [t for t in extra_phrase_tokenizer(text) if t not in ('[[', ']]')] + idx_tokens = list(index_tokenizer(text)) + assert ep_tokens == idx_tokens + + def test_get_extra_phrase_spans_ignores_unclosed_opening_bracket(self): + text = 'Neither the name of [[3 nor the names of its' + assert get_extra_phrase_spans(text) == [] + + def test_get_extra_phrase_spans_ignores_unopened_closing_bracket(self): + text = 'Neither the name of 3]] nor the names of its' + assert get_extra_phrase_spans(text) == [] + + class TestNgrams(FileBasedTesting): test_data_dir = TEST_DATA_DIR