2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_158.RULE
@@ -14,7 +14,7 @@ Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

Neither the name of nor the names of its
Neither the name of [[6]] nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/bsd-new_newlib3.RULE
@@ -11,7 +11,7 @@ are permitted provided that the following conditions are met:
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University nor the names of its contributors
* Neither the name of the [[3]] University nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

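The [[6]] and [[3]] markers added to these rules are extra-phrase markers: each one flags a token position where up to that many extra words may appear in matched text without being treated as out of place. As a minimal sketch of how such a marker is parsed, here is the get_extra_phrase_spans helper introduced later in this PR applied to the bsd-new_158 fragment; the exact span position assumes the tokenization shown in that helper's doctest.

from licensedcode.spans import Span
from licensedcode.tokenize import get_extra_phrase_spans

# The [[6]] marker sits at token index 4, right after 'neither the name of',
# and the integer caps the number of extra words allowed there at 6.
spans = get_extra_phrase_spans('Neither the name of [[6]] nor the names of its')
assert spans == [(Span([4]), 6)], spans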
27 changes: 23 additions & 4 deletions src/licensedcode/detection.py
@@ -30,6 +30,8 @@
from licensedcode.cache import get_licensing
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import is_extra_words_position_valid
from licensedcode.match import is_extra_words_at_valid_positions
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
from licensedcode.models import UnDetectedRule
@@ -110,6 +112,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
UNKNOWN_MATCH = 'unknown-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
@@ -129,6 +132,7 @@ class DetectionRule(Enum):
"""
UNKNOWN_MATCH = 'unknown-match'
EXTRA_WORDS = 'extra-words'
EXTRA_WORDS_PERMITTED = 'extra-words-permitted-in-rule'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -405,8 +409,12 @@ def score(self):
by the length of a match to the overall detection length.
"""
length = self.length
weighted_scores = (m.score() * (m.len() / length) for m in self.matches)
return min([round(sum(weighted_scores), 2), 100])
        weighted_scores = 0
        for m in self.matches:
            # Check whether extra words in the matched text appear in allowed positions,
            # and do not exceed the maximum allowed word count at those positions.
            score = 100 if is_extra_words_position_valid(m) else m.score()
            weighted_scores += score * (m.len() / length)
        return min([round(weighted_scores, 2), 100])

def append(
self,
@@ -1072,6 +1080,7 @@ def is_correct_detection_non_unknown(license_matches):
is_correct_detection(license_matches)
and not has_unknown_matches(license_matches)
and not has_extra_words(license_matches)
and not is_extra_words_at_valid_positions(license_matches)
)


@@ -1087,7 +1096,7 @@ def is_correct_detection(license_matches):
]

return (
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
all(matcher in ("1-hash", "1-spdx-id", "2-aho", "3-seq") for matcher in matchers)
and all(is_match_coverage_perfect)
)

@@ -1570,6 +1579,12 @@ def get_detected_license_expression(
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression

elif analysis == DetectionCategory.EXTRA_WORDS_PERMITTED.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS_PERMITTED.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.EXTRA_WORDS_PERMITTED.value)

elif analysis == DetectionCategory.EXTRA_WORDS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.EXTRA_WORDS.value}')
@@ -1810,7 +1825,11 @@ def analyze_detection(license_matches, package_license=False):

# Case where at least one of the match have extra words
elif has_extra_words(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS.value
# Case where `extra-words` are in the right place
if is_extra_words_at_valid_positions(license_matches=license_matches):
return DetectionCategory.EXTRA_WORDS_PERMITTED.value
else:
return DetectionCategory.EXTRA_WORDS.value

# Cases where Match Coverage is a perfect 100 for all matches
else:
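For context on the score() change above, here is a small self-contained sketch of the new weighted aggregation using made-up match lengths and scores rather than real LicenseMatch objects: a match whose extra words all sit at permitted positions is counted as a 100 score before being weighted by its share of the detection length.

# Hypothetical per-match data: length, reported score, and whether extra words
# appear only at permitted positions.
matches = [
    {"length": 210, "score": 99.53, "extra_words_ok": True},   # counted as 100
    {"length": 30, "score": 95.0, "extra_words_ok": False},
]
total_length = sum(m["length"] for m in matches)
weighted_scores = 0
for m in matches:
    score = 100 if m["extra_words_ok"] else m["score"]
    weighted_scores += score * (m["length"] / total_length)
print(min(round(weighted_scores, 2), 100))  # 99.38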
84 changes: 83 additions & 1 deletion src/licensedcode/match.py
@@ -826,7 +826,11 @@ def to_dict(
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['matcher'] = self.matcher
result['score'] = self.score()
        # Update the score when extra words appear only at permitted positions
        if is_extra_words_position_valid(match=self):
            result['score'] = 100
        else:
            result['score'] = self.score()
result['matched_length'] = self.len()
result['match_coverage'] = self.coverage()
result['rule_relevance'] = self.rule.relevance
@@ -1071,6 +1075,84 @@ def merge_matches(matches, max_dist=None, trace=TRACE_MERGE):
# early from the loops: trying to check containment on wildly separated matches
# does not make sense

def is_extra_words_position_valid(match):
"""
Return True if the extra words appear in valid positions and
do not exceed the maximum allowed word count at those positions.
Otherwise, return False.
"""
    # Compute the coverage-relevance score to detect whether the match contains extra words
score_coverage_relevance = (
match.coverage() * match.rule.relevance
) / 100

# Calculate the query coverage coefficient
query_coverage_coefficient = score_coverage_relevance - match.score()

# Return False if the match has no extra words
if query_coverage_coefficient == 0:
return False

matched_tokens = list(index_tokenizer(match.matched_text(whole_lines=False, highlight=False)))
rule_tokens = list(index_tokenizer(match.rule.text))
extra_phrase_spans = match.rule.extra_phrase_spans

if not extra_phrase_spans:
return False

    # Count of extra-words tokens inserted in `matched_tokens`
matched_count = 0

# Count of extra phrase markers
extra_phrase_count = 0

rule_index = 0
matched_index = 0

for span, allowed_extra_word in extra_phrase_spans:
rule_index = span.start

matched_index = span.start + matched_count - extra_phrase_count
extra_words_count = 0

        # Return False if the token just before the extra words in `matched_tokens`
        # differs from the token just before the extra-phrase marker in `rule_tokens`
        if matched_tokens[matched_index - 1] != rule_tokens[rule_index - 1]:
            return False

# Count how many tokens in `matched_text` do not match the next rule token
while (matched_index < len(matched_tokens) and
matched_tokens[matched_index] != rule_tokens[rule_index + 1]):
matched_index += 1
matched_count += 1
extra_words_count += 1

if extra_words_count > allowed_extra_word:
return False

extra_phrase_count += 1

        rule_index += 1

    # After handling all extra-phrase spans, advance through any remaining tokens in lockstep
    while (matched_index < len(matched_tokens) and
            matched_tokens[matched_index] == rule_tokens[rule_index]):
        matched_index += 1
        rule_index += 1

    # Any leftover matched tokens are extra words outside the permitted positions
if matched_index != len(matched_tokens):
return False

return True


def is_extra_words_at_valid_positions(license_matches):
"""
    Return True if any of the matches in ``license_matches`` has its
    extra words at valid positions.
"""
return any(is_extra_words_position_valid(match) for match in license_matches)


def filter_contained_matches(
matches,
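To make the position check concrete, the following standalone sketch captures the core idea behind is_extra_words_position_valid() using plain token lists and hypothetical matched text instead of a real LicenseMatch: the tokens inserted between the anchor words around an extra-phrase marker are counted against that marker's allowance.

# Rule fragment: 'Neither the name of [[6]] nor the names of its'
# Hypothetical matched text: 'Neither the name of the FreeBSD Project nor the names of its'
matched_tokens = ['neither', 'the', 'name', 'of', 'the', 'freebsd', 'project',
                  'nor', 'the', 'names', 'of', 'its']
before_marker, after_marker, allowed = 'of', 'nor', 6

start = matched_tokens.index(before_marker) + 1
end = matched_tokens.index(after_marker)
inserted = matched_tokens[start:end]    # ['the', 'freebsd', 'project']
print(len(inserted) <= allowed)         # True: 3 extra words, up to 6 allowed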
26 changes: 26 additions & 0 deletions src/licensedcode/models.py
@@ -43,6 +43,7 @@
from licensedcode.tokenize import index_tokenizer
from licensedcode.tokenize import index_tokenizer_with_stopwords
from licensedcode.tokenize import query_lines
from licensedcode.tokenize import get_extra_phrase_spans
from scancode.api import SCANCODE_LICENSEDB_URL
from scancode.api import SCANCODE_LICENSE_URL
from scancode.api import SCANCODE_RULE_URL
@@ -1683,6 +1684,17 @@ class BasicRule:
)
)

extra_phrase_spans = attr.ib(
default=attr.Factory(list),
repr=False,
metadata=dict(
            help='List of tuples `(Span, int)` representing extra phrases for this rule. '
            'Each tuple contains a Span of token positions in the rule text and an integer '
            'indicating the maximum number of extra tokens allowed at that position. '
            'Extra phrases are enclosed in [[double square brackets]] in the rule text.'
)
)

source = attr.ib(
default=None,
repr=False,
@@ -2317,6 +2329,9 @@ def tokens(self):
"is_continuous", "minimum_coverage" and "stopword_by_pos" are
recomputed as a side effect.
"""

# identify and capture the spans of extra phrases specified within the rule
self.extra_phrase_spans = list(self.extra_phrases())

text = self.text
# We tag this rule as being a bare URL if it starts with a scheme and is
@@ -2353,6 +2368,17 @@ def _set_continuous(self):
):
self.is_continuous = True

def extra_phrases(self):
"""
Return an iterable of `(Span, int)` tuples marking the positions of extra phrases in the rule text.

Each tuple consists of:
- a `Span` object representing the position in the tokenized rule text, and
- an integer `n` indicating how many extra tokens are allowed at that position.
"""
if self.text:
yield from get_extra_phrase_spans(self.text)

def build_required_phrase_spans(self):
"""
Return a list of Spans marking required phrases token positions of that must
72 changes: 72 additions & 0 deletions src/licensedcode/tokenize.py
@@ -81,12 +81,82 @@ def query_lines(
required_phrase_pattern = '(?:' + query_pattern + '|\\{\\{|\\}\\})'
required_phrase_splitter = re.compile(required_phrase_pattern, re.UNICODE).findall


extra_phrase_pattern = '(?:' + query_pattern + r'|\[\[|\]\])'
extra_phrase_splitter = re.compile(extra_phrase_pattern, re.UNICODE).findall


# Pattern to match and remove extra-phrase markers such as [[1]] or [[4]] from the text
extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')

REQUIRED_PHRASE_OPEN = '{{'
REQUIRED_PHRASE_CLOSE = '}}'

EXTRA_PHRASE_OPEN = '[['
EXTRA_PHRASE_CLOSE = ']]'

# FIXME: this should be folded in a single pass tokenization with the index_tokenizer


def extra_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
    Yield tokens from a rule ``text``, including extra-phrase [[n]] markers.
    The ``n`` denotes the maximum number of extra words allowed at that position.
    This works the same way as ``required_phrase_tokenizer``.
"""
if not text:
return
if not preserve_case:
text = text.lower()

for token in extra_phrase_splitter(text):
if token and token not in stopwords:
yield token


def get_extra_phrase_spans(text):
"""
Return a list of tuples `(Span, int)`, one for each [[n]] extra phrase found in ``text``.
Here, `n` should always be a digit token inside the extra phrase brackets.

Example:
>>> text = 'Neither the name [[3]] of nor the names of its'
>>> # 0 1 2 3 4 5 6 7 8 9
>>> x = get_extra_phrase_spans(text)
>>> assert x == [(Span([3]), 3)], x
"""
ipos = 0
in_extra_phrase = False
current_phrase_value = []
extra_phrase_spans = []

for token in extra_phrase_tokenizer(text):
if token == EXTRA_PHRASE_OPEN:
in_extra_phrase = True
current_phrase_value = []
continue

elif token == EXTRA_PHRASE_CLOSE:
if in_extra_phrase:
                # The value between the ``[[`` and ``]]`` markers must be exactly one
                # token, and that token must be a digit
if len(current_phrase_value) == 1 and current_phrase_value[0].isdigit():
extra_phrase_spans.append((Span([ipos - 1]), int(current_phrase_value[0])))

in_extra_phrase = False
current_phrase_value = []
continue

if in_extra_phrase:
            # Keep only the first token found after the opening ``[[``
if len(current_phrase_value) == 0:
current_phrase_value.append(token)

ipos += 1

return extra_phrase_spans


def required_phrase_tokenizer(text, stopwords=STOPWORDS, preserve_case=False):
"""
Yield tokens from a rule ``text`` including required phrases {{brace}} markers.
@@ -282,6 +352,8 @@ def index_tokenizer_with_stopwords(text, stopwords=STOPWORDS):
"""
if not text:
return [], {}

text = extra_phrase_removal_pattern.sub('', text)

tokens = []
tokens_append = tokens.append
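Finally, a minimal sketch of what the new extra_phrase_removal_pattern call in index_tokenizer_with_stopwords() does: [[n]] markers are stripped from the rule text before tokens are indexed, so the markers never become index tokens themselves.

import re

extra_phrase_removal_pattern = re.compile(r'\[\[\d+\]\]')
text = 'Neither the name of [[6]] nor the names of its'
print(extra_phrase_removal_pattern.sub('', text))
# -> 'Neither the name of  nor the names of its'

The remaining hunks below update a license detection test expectation: with the extra words now permitted by the rule, the detection log reads extra-words-permitted-in-rule and the per-match scores rise from 99.53 to 100.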
@@ -6,7 +6,7 @@
"license_expression_spdx": "BSD-3-Clause",
"detection_count": 1,
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"reference_matches": [
{
@@ -16,7 +16,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -46,7 +46,7 @@
"start_line": 4,
"end_line": 27,
"matcher": "2-aho",
"score": 99.53,
"score": 100,
"matched_length": 210,
"match_coverage": 100.0,
"rule_relevance": 100,
@@ -57,7 +57,7 @@
}
],
"detection_log": [
"extra-words"
"extra-words-permitted-in-rule"
],
"identifier": "bsd_new-fbfc5955-0c63-4c98-2ce9-08e1e1796f50"
}