Skip to content

Commit f13b100

Browse files
Add required phrases from other rules
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent 966adde commit f13b100

File tree

14 files changed

+708
-307
lines changed

14 files changed

+708
-307
lines changed

docs/source/how-to-guides/add_new_license_detection_rule.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ More (advanced) rules options:
7373
be present in the result license detections. These just have the license text and a
7474
`is_false_positive` flag set to True.
7575

76-
- you can specify key phrases by surrounding one or more words between the `{{`
76+
- you can specify required phrases by surrounding one or more words between the `{{`
7777
and `}}` tags. Required phrases are words that **must** be matched/present in order
7878
for a RULE to be considered a match.
7979

etc/scripts/licenses/buildrules.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from licensedcode import models
1717
from licensedcode import match_hash
1818
from licensedcode import frontmatter
19+
from licensedcode.models import rule_exists
1920
from license_expression import Licensing
2021

2122
"""
@@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
129130
return rules
130131

131132

132-
def rule_exists(text):
133-
"""
134-
Return the matched rule identifier if the text is an existing rule matched
135-
exactly, False otherwise.
136-
"""
137-
idx = cache.get_index()
138-
139-
matches = idx.match(query_string=text)
140-
if not matches:
141-
return False
142-
if len(matches) > 1:
143-
return False
144-
match = matches[0]
145-
if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
146-
return match.rule.identifier
147-
148-
149133
def all_rule_by_tokens():
150134
"""
151135
Return a mapping of {tuples of tokens: rule id}, with one item for each

etc/scripts/licenses/report_license_rules.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
"is_license_reference",
6363
"is_license_intro",
6464
"is_license_clue",
65+
"is_required_phrase",
66+
"skip_collecting_required_phrases",
6567
"is_deprecated",
6668
"has_unknown",
6769
"only_known_words",

src/licensedcode/match.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
TRACE_FILTER_BELOW_MIN_SCORE = False
5656
TRACE_FILTER_SINGLE_WORD_GIBBERISH = False
5757
TRACE_SET_LINES = False
58-
TRACE_KEY_PHRASES = False
58+
TRACE_REQUIRED_PHRASES = False
5959
TRACE_REGIONS = False
6060
TRACE_FILTER_LICENSE_LIST = False
6161
TRACE_FILTER_LICENSE_LIST_DETAILED = False
@@ -91,7 +91,7 @@ def logger_debug(*args): pass
9191
or TRACE_MATCHED_TEXT_DETAILS
9292
or TRACE_HIGHLIGHTED_TEXT
9393
or TRACE_FILTER_SINGLE_WORD_GIBBERISH
94-
or TRACE_KEY_PHRASES
94+
or TRACE_REQUIRED_PHRASES
9595
or TRACE_REGIONS
9696
or TRACE_FILTER_LICENSE_LIST
9797
or TRACE_FILTER_LICENSE_LIST_DETAILED
@@ -133,7 +133,7 @@ def _debug_print_matched_query_text(match, extras=5):
133133

134134
class DiscardReason(IntEnum):
135135
NOT_DISCARDED = 0
136-
MISSING_KEY_PHRASES = 1
136+
MISSING_REQUIRED_PHRASES = 1
137137
BELOW_MIN_COVERAGE = 2
138138
SPURIOUS_SINGLE_TOKEN = 3
139139
TOO_SHORT = 4
@@ -634,15 +634,15 @@ def combine(self, other):
634634
discard_reason = DiscardReason.NOT_DISCARDED
635635

636636
elif (
637-
self.discard_reason == DiscardReason.MISSING_KEY_PHRASES
638-
and other.discard_reason == DiscardReason.MISSING_KEY_PHRASES
637+
self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
638+
and other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
639639
):
640-
discard_reason = DiscardReason.MISSING_KEY_PHRASES
640+
discard_reason = DiscardReason.MISSING_REQUIRED_PHRASES
641641

642-
elif self.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
642+
elif self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
643643
discard_reason = other.discard_reason
644644

645-
elif other.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
645+
elif other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
646646
discard_reason = self.discard_reason
647647

648648
else:
@@ -2116,17 +2116,17 @@ def filter_false_positive_matches(
21162116
return kept, discarded
21172117

21182118

2119-
def filter_matches_missing_key_phrases(
2119+
def filter_matches_missing_required_phrases(
21202120
matches,
2121-
trace=TRACE_KEY_PHRASES,
2122-
reason=DiscardReason.MISSING_KEY_PHRASES,
2121+
trace=TRACE_REQUIRED_PHRASES,
2122+
reason=DiscardReason.MISSING_REQUIRED_PHRASES,
21232123
):
21242124
"""
21252125
Return a filtered list of kept LicenseMatch matches and a list of
21262126
discardable matches given a ``matches`` list of LicenseMatch by removing
2127-
all ``matches`` that do not contain all key phrases defined in their matched
2127+
all ``matches`` that do not contain all required phrases defined in their matched
21282128
rule.
2129-
A key phrase must be matched exactly without gaps or unknown words.
2129+
A required phrase must be matched exactly without gaps or unknown words.
21302130
21312131
A rule with "is_continuous" set to True is the same as if its whole text
21322132
was defined as a required phrase and is processed here too.
@@ -2143,14 +2143,14 @@ def filter_matches_missing_key_phrases(
21432143
discarded_append = discarded.append
21442144

21452145
if trace:
2146-
logger_debug('filter_matches_missing_key_phrases')
2146+
logger_debug('filter_matches_missing_required_phrases')
21472147

21482148
for match in matches:
21492149
if trace:
21502150
logger_debug(' CHECKING KEY PHRASES for:', match)
21512151

21522152
is_continuous = match.rule.is_continuous
2153-
ikey_spans = match.rule.key_phrase_spans
2153+
ikey_spans = match.rule.required_phrase_spans
21542154

21552155
if not (ikey_spans or is_continuous):
21562156
kept_append(match)
@@ -2180,11 +2180,11 @@ def filter_matches_missing_key_phrases(
21802180
# use whole ispan in this case
21812181
ikey_spans = [match.ispan]
21822182

2183-
# keep matches as candidate if they contain all key phrase positions in the ispan
2183+
# keep matches as candidate if they contain all required phrase positions in the ispan
21842184
if trace:
21852185
print(' CANDIDATE TO KEEP: all ikey_span in match.ispan:', ikey_spans, ispan)
21862186

2187-
# discard matches that contain key phrases, but interrupted by
2187+
# discard matches that contain required phrases, but interrupted by
21882188
# unknown or stop words.
21892189

21902190
unknown_by_pos = match.query.unknowns_by_pos
@@ -2195,7 +2195,7 @@ def filter_matches_missing_key_phrases(
21952195
istopwords_by_pos = match.rule.stopwords_by_pos
21962196
istopwords_by_pos_get = istopwords_by_pos.get
21972197

2198-
# iterate on each key phrase span to ensure that they are continuous
2198+
# iterate on each required phrase span to ensure that they are continuous
21992199
# and contain no unknown words on the query side
22002200

22012201
is_valid = True
@@ -2204,7 +2204,7 @@ def filter_matches_missing_key_phrases(
22042204

22052205
for ikey_span in ikey_spans:
22062206

2207-
# check that are no gaps in the key phrase span on the query side
2207+
# check that there are no gaps in the required phrase span on the query side
22082208
# BUT, do not redo the check for is_continuous already checked above
22092209
if is_continuous:
22102210
qkey_span = qspan
@@ -2225,13 +2225,13 @@ def filter_matches_missing_key_phrases(
22252225
is_valid = False
22262226
break
22272227

2228-
# check that key phrase spans does not contain stop words and does
2228+
# check that required phrase spans do not contain stop words and do
22292229
# not contain unknown words
22302230

2231-
# NOTE: we do not check the last qkey_span position of a key phrase
2231+
# NOTE: we do not check the last qkey_span position of a required phrase
22322232
# since unknown is a number of words after a given span position:
22332233
# these are pinned to the last position and we would not care for
2234-
# what unknown or stop words show up after a key phrase ends.
2234+
# what unknown or stop words show up after a required phrase ends.
22352235

22362236
qkey_span_end = qkey_span.end
22372237
contains_unknown = any(
@@ -2694,7 +2694,7 @@ def _log(_matches, _discarded, msg):
26942694
# FIXME: we should have only a single loop on all the matches at once!!
26952695
# and not 10's of loops!!!
26962696

2697-
matches, discarded = filter_matches_missing_key_phrases(matches)
2697+
matches, discarded = filter_matches_missing_required_phrases(matches)
26982698
all_discarded_extend(discarded)
26992699
_log(matches, discarded, 'HAS KEY PHRASES')
27002700

0 commit comments

Comments
 (0)