2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_158.RULE
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

-Neither the name of nor the names of its
+Neither the name of [[6]] nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

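Note: the [[6]] added above is an "extra phrase" marker. It means up to 6 extra
words may appear at that position (e.g. a company name) without the match losing
its score. A minimal sketch of the marker syntax, using a hypothetical helper
`allowed_extra_words` that is not part of this PR:

import re

def allowed_extra_words(rule_text):
    # Collect the allowed extra-word count for each [[n]] marker, in order.
    return [int(n) for n in re.findall(r'\[\[(\d+)\]\]', rule_text)]

assert allowed_extra_words('Neither the name of [[6]] nor the names of its') == [6]
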
6 changes: 4 additions & 2 deletions src/licensedcode/data/rules/bsd-new_578.RULE
@@ -6,7 +6,9 @@ minimum_coverage: 99

Software License Agreement (BSD License)

-Redistribution and use in source and binary forms, with or without
+[[15]]
+
+Redistribution and use [[4]] in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

@@ -16,7 +18,7 @@ are met:
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
-* Neither the name of nor the names of its
+* Neither the name of [[6]] nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

24 changes: 24 additions & 0 deletions src/licensedcode/detection.py
@@ -30,6 +30,7 @@
from licensedcode.cache import get_licensing
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import is_extra_words_position_valid
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
from licensedcode.models import UnDetectedRule
@@ -110,6 +111,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
UNKNOWN_MATCH = 'unknown-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +131,7 @@ class DetectionRule(Enum):
"""
UNKNOWN_MATCH = 'unknown-match'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -1072,6 +1075,7 @@ def is_correct_detection_non_unknown(license_matches):
is_correct_detection(license_matches)
and not has_unknown_matches(license_matches)
and not has_extra_words(license_matches)
and not is_extra_words_at_valid_positions(license_matches)
)


@@ -1159,6 +1163,16 @@ def has_low_rule_relevance(license_matches):
)


def is_extra_words_at_valid_positions(license_matches):
"""
Return True if any of the matches in ``license_matches`` List of LicenseMatch
has its extra words at positions permitted by the rule.
"""
return any(
is_extra_words_position_valid(license_match)
for license_match in license_matches
)

def is_false_positive(license_matches, package_license=False):
"""
Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1570,6 +1584,12 @@ def get_detected_license_expression(
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression

elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)

elif analysis == DetectionCategory.EXTRA_WORDS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1807,6 +1827,10 @@ def analyze_detection(license_matches, package_license=False):
threshold=IMPERFECT_MATCH_COVERAGE_THR,
):
return DetectionCategory.IMPERFECT_COVERAGE.value

# Case where `extra-words` are at positions permitted by the rule
elif is_extra_words_at_valid_positions(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS_PERMITTED.value

# Case where at least one of the matches has extra words
elif has_extra_words(license_matches=license_matches):
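
Taken together, these detection.py changes classify a match whose extra words all
sit at [[n]]-marked positions ahead of the generic extra-words case. A simplified
sketch of the branch order, assumed from the diff above (the real analyze_detection
checks several more cases):

def sketch_analyze(license_matches):
    # Permitted extra words win over the generic extra-words category.
    if is_extra_words_at_valid_positions(license_matches=license_matches):
        return DetectionCategory.EXTRA_WORDS_PERMITTED.value
    if has_extra_words(license_matches=license_matches):
        return DetectionCategory.EXTRA_WORDS.value
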
57 changes: 57 additions & 0 deletions src/licensedcode/match.py
@@ -598,6 +598,12 @@ def score(self):
in the matched range (including unknowns and unmatched) and the matched
rule relevance.
"""

# Check whether extra words in the matched text appear in allowed positions,
# and do not exceed the maximum allowed word count at those positions.
if is_extra_words_position_valid(match=self):
return 100

# relevance is a number between 0 and 100. Divide by 100
relevance = self.rule.relevance / 100
if not relevance:
@@ -1071,6 +1077,57 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
# early from the loops: trying to check containment on wildly separated matches
# does not make sense

def is_extra_words_position_valid(match):
"""
Return True if the extra words appear in valid positions and
do not exceed the maximum allowed word count at those positions.
Otherwise, return False.
"""

rule_spans = match.ispan.subspans()

# If there are multiple subspans, the matched rule tokens are not contiguous.
if len(rule_spans) > 1:
return False

matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
rule_tokens = list(index_tokenizer(match.rule.text))
extra_phrase_spans = match.rule.extra_phrase_spans

if not extra_phrase_spans:
return False

# Count of `extra-words` tokens inserted in `matched_tokens` so far
matched_count = 0

# Count of extra phrase markers
extra_phrase_count = 0

for span, allowed_extra_words in extra_phrase_spans:
rule_index = span.start - extra_phrase_count - 1

matched_index = span.start + matched_count - extra_phrase_count
extra_words_count = 0

# Return False if the token just before the extra words in `matched_tokens`
# differs from the token just before the `[[n]]` marker in `rule_tokens`
if matched_tokens[matched_index - 1] != rule_tokens[rule_index]:
return False

# Count how many tokens in `matched_text` do not match the next rule token
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
matched_index += 1
matched_count += 1
extra_words_count += 1

if extra_words_count > allowed_extra_words:
return False

extra_phrase_count += 1

return True


def filter_contained_matches(
matches,
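
A worked sketch of the position check above, assuming word-level tokenization: for
a rule "name of [[2]] nor" and matched text "name of foo bar nor", the loop consumes
"foo" and "bar" until it re-synchronizes on "nor"; two extra words are within the
allowed two, so the match keeps its score:

# Illustrative re-synchronization, mirroring the while-loop above.
rule_tokens = ['name', 'of', 'nor']              # '[[2]]' marker already stripped
matched_tokens = ['name', 'of', 'foo', 'bar', 'nor']
allowed_extra_words = 2
matched_index = 2                                # marker position: right after 'of'
extra_words_count = 0
while matched_tokens[matched_index] != rule_tokens[2]:
    matched_index += 1
    extra_words_count += 1
assert extra_words_count <= allowed_extra_words  # 2 extra words: permitted
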
41 changes: 41 additions & 0 deletions src/licensedcode/models.py
@@ -7,6 +7,7 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import re
import os
import sys
import traceback
@@ -43,6 +44,7 @@
from licensedcode.tokenize import index_tokenizer
from licensedcode.tokenize import index_tokenizer_with_stopwords
from licensedcode.tokenize import query_lines
from licensedcode.tokenize import get_extra_phrase_spans
from scancode.api import SCANCODE_LICENSEDB_URL
from scancode.api import SCANCODE_LICENSE_URL
from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1685,17 @@ class BasicRule:
)
)

extra_phrase_spans = attr.ib(
default=attr.Factory(list),
repr=False,
metadata=dict(
help='List of tuples `(Span, int)` representing extra phrases for this rule. '
'Each tuple contains a Span of token positions in the rule text and an integer '
'indicating the maximum number of extra tokens allowed at that position. '
'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
)
)

source = attr.ib(
default=None,
repr=False,
@@ -2306,6 +2319,9 @@ def load_data(self, rule_file):
except Exception:
trace = traceback.format_exc()
raise InvalidRule(f'While loading: file://{rule_file}\n{trace}')

# remove extra_phrase markers from the rule text
self.text = remove_extra_phrase(self.text)

return self

@@ -2317,8 +2333,15 @@ def tokens(self):
"is_continuous", "minimum_coverage" and "stopword_by_pos" are
recomputed as a side effect.
"""

# identify and capture the spans of extra phrases specified within the rule
self.extra_phrase_spans = list(self.extra_phrases())

# remove extra_phrase markers from the rule text
self.text = remove_extra_phrase(self.text)

text = self.text

# We tag this rule as being a bare URL if it starts with a scheme and is
# on one line: this is used to determine a matching approach

@@ -2353,6 +2376,17 @@ def _set_continuous(self):
):
self.is_continuous = True

def extra_phrases(self):
"""
Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text.

Each tuple consists of:
- a `Span` object representing the position in the tokenized rule text, and
- an integer `n` indicating how many extra tokens are allowed at that position.
"""
if self.text:
yield from get_extra_phrase_spans(self.text)

def build_required_phrase_spans(self):
"""
Return a list of Spans marking required phrases token positions of that must
@@ -2570,6 +2604,13 @@ def from_match_data(license_match_mapping):
return get_index().rules_by_id[rule_identifier]


def remove_extra_phrase(text):
"""
Remove extra phrase markers like [[n]], where n is one or more digits.
"""
pattern = r'\[\[\d+\]\]'
return re.sub(pattern, '', text)

def compute_relevance(length):
"""
Return a computed ``relevance`` given a ``length`` and a threshold.
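
For instance, remove_extra_phrase strips each marker wholesale and leaves the
surrounding text untouched; the leftover double space is harmless because the
downstream tokenizers split on word characters:

import re

assert re.sub(r'\[\[\d+\]\]', '', 'name of [[6]] nor') == 'name of  nor'
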
66 changes: 66 additions & 0 deletions src/licensedcode/tokenize.py
@@ -81,12 +81,78 @@ def query_lines(
required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall


extra_phrase_pattern = r'(?:' + query_pattern + r'|\[\[|\]\])'
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall

REQUIRED_PHRASE_OPEN = '{{'
REQUIRED_PHRASE_CLOSE = '}}'

EXTRA_PHRASE_OPEN = '[['
EXTRA_PHRASE_CLOSE = ']]'

# FIXME: this should be folded in a single pass tokenization with the index_tokenizer


def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text``, including extra phrase [[n]] markers.
Here n denotes the maximum number of extra words allowed at that position.
This is otherwise the same as ``required_phrase_tokenizer``.
"""
if not text:
return
if not preserve_case:
text = text.lower()

for token in extra_phrase_splitter(text):
if token and token not in stopwords:
yield token


def get_extra_phrase_spans(text):
"""
Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``.
Here, `n` should always be a digit token inside the extra phrase brackets.

Example:
>>> text = 'Neither the name [[3]] of nor the names of its'
>>> # 0 1 2 3 4 5 6 7 8 9
>>> x = get_extra_phrase_spans(text)
>>> assert x == [(Span([3]), 3)], x
"""
ipos = 0
in_extra_phrase = False
current_phrase_value = []
extra_phrase_spans = []

for token in extra_phrase_tokenizer(text):
if token == EXTRA_PHRASE_OPEN:
in_extra_phrase = True
current_phrase_value = []
continue

elif token == EXTRA_PHRASE_CLOSE:
if in_extra_phrase:
# the marker must contain exactly one token, and that token must be a
# digit, i.e. the form must be ``[[n]]``
if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))

in_extra_phrase = False
current_phrase_value = []
continue

if in_extra_phrase:
# keep only the first token found after the opening ``[[``
if len(current_phrase_value) == 0:
current_phrase_value.append(token)

ipos += 1

return extra_phrase_spans


def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
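
As an illustration of the two helpers above (assuming the default stopwords filter
none of these words): extra_phrase_tokenizer yields the bracket markers as standalone
tokens, while get_extra_phrase_spans folds each marker into a (Span, n) pair and
numbers only the real tokens:

# list(extra_phrase_tokenizer('Neither the name [[3]] of'))
#   -> ['neither', 'the', 'name', '[[', '3', ']]', 'of']
# get_extra_phrase_spans('Neither the name [[3]] of')
#   -> [(Span([3]), 3)]
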
@@ -1,12 +1,12 @@
{
"license_detections": [
{
"identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50",
"identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e",
"license_expression": "bsd-new",
"license_expression_spdx": "BSD-3-Clause",
"detection_count": 1,
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"reference_matches": [
{
Expand All @@ -16,7 +16,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
Review comment (Member):
This seems to work for 2-aho, but I don't think you've changed the 3-seq test. Did that not work correctly as expected?
The score seems to be unchanged there: https://github.com/alok1304/scancode-toolkit/blob/extra_words/tests/licensedcode/data/plugin_license/extra-words/scan-extra-words-3-seq-license.expected.json#L49
Could you debug what happened?

Reply (Collaborator, Author):
In 3-seq it matches other rules, which have the extra-words at a different position. I will fix that for this case: I will mark another position where extra-words may be present.

"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -46,7 +46,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -57,9 +57,9 @@
}
],
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
"identifier": "bsd_new-f3efb258-e9c3-604d-9bbd-147fe74af22e"
}
],
"license_clues": [],
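
The score change is consistent with the new short-circuit in LicenseMatch.score():
before, the extra words diluted the score (plausibly one extra token in a 211-token
matched range, giving 210 / 211 ≈ 99.53); with all extra words at [[n]]-permitted
positions, the match now reports a flat 100.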